Aim: To test the association between age-related macular degeneration (AMD) with epigenetic age.
Pre-processing script adapted from Chaini Konwar.
library(GEOquery)
library(RCurl)
library(GEOmetadb)
library(dendextend)
library(ArrayExpress)
library(methylumi)
library(lumi)
library(lattice)
library(gplots)
library(RColorBrewer)
library(limma)
library(ROC)
library(matrixStats)
library(reshape)
library(sva)
library(grid)
library(gridExtra)
library(ape)
library(Hmisc)
library(RCurl)
library(wateRmelon)
library(minfiData)
library(minfi)
library(robustHD)
library(ewastools)
library(omicsPrint)
library(doParallel)
library(jcolors)
library(plyr)
library(tidyverse)
library(ggrepel)
library(ggpubr)
library(IlluminaHumanMethylationEPICanno.ilm10b2.hg19)
library(IlluminaHumanMethylationEPICmanifest)
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
library(IlluminaHumanMethylation450kmanifest)
library(FlowSorted.Blood.EPIC)
library(FlowSorted.Blood.450k)
library(FlowSorted.CordBloodCombined.450k)
library(Biobase)
library(data.table)
library(factoextra)
library(Metrics)
library(quantro)
library(impute)
setwd("~/KoborLab/kobor_space/kendrix/macular_degeneration/")The data used here comes from Whole-genome methylation profiling of the retinal pigment epithelium of individuals with age-related macular degeneration reveals differential methylation of the SKI, GTF2H4, and TNXB genes by Porter et al..
The authors made the dataset publicly available in ArrayExpress with the Accession number: E-MTAB-7183.
The dataset consists of ocular tissue (pigmented layer of retina) from 25 AMD samples (21 level 2 AMD and 4 level 3 AMD) and 19 controls.
The age ranges from 50 years old to 89 years old, and the sex distribution is as follows: Male (27), Female (17).
#Explore metadata.
AMD_meta <- read.table("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/E-MTAB-7183_meta.txt", sep = "\t", header = TRUE)
colnames(AMD_meta)[1] <- "Sample_Name"
colnames(AMD_meta)[3] <- "Sex"
colnames(AMD_meta)[4] <- "Age"
colnames(AMD_meta)[6] <- "Tissue"
colnames(AMD_meta)[7] <- "Disease_state"
colnames(AMD_meta)[17] <- "Assay_name"
AMD_meta$Sex <- gsub("male", "M", gsub("female", "F", AMD_meta$Sex)) #Changed for easier comparison with minfi sex prediction.
#Create samplesheet.
AMD_samplesheet <- AMD_meta[, c("Sample_Name", "Disease_state", "Assay_name", "Sex", "Age", "Tissue")]
colnames(AMD_samplesheet)[2] <- "Sample_Group"
AMD_samplesheet$Assay_name <- gsub("_Grn", "", gsub("_Red", "", AMD_samplesheet$Assay_name))
AMD_samplesheet <- separate(AMD_samplesheet, col = "Assay_name", sep = "_", into = c("Sentrix_ID", "Sentrix_Position"))
duplicated(AMD_samplesheet) #The samplesheet contains duplicated samples.
AMD_samplesheet <- AMD_samplesheet[duplicated(AMD_samplesheet),] #Remove duplicated samples. write.csv(AMD_samplesheet, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/AMD_samplesheet.csv", row.names = FALSE)path = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project"
targets <- read.metharray.sheet(path) #Tells R to look for sample sheet within the folder.
baseDir <- system.file(path, package = "minfiData")
baseDir #baseDir determines the array chip position and ID.
sub(baseDir, "", targets$Basename) #The class of RGSet is an RGChannelSet object. This is the initial object of a minfi analysis that contains the raw intensities in the green and red channels. Note that this object contains the intensities of the internal control probes as well.
#Create Extended RGSet object.
AMD_ExtendedRGSet <- read.metharray.exp(targets = targets, extended = TRUE, verbose = TRUE)
AMD_ExtendedRGSetsampleNames(AMD_ExtendedRGSet) == paste0(AMD_samplesheet$Sentrix_ID, "_", AMD_samplesheet$Sentrix_Position) #Check order of sampleNames in RGSet and samplesheet to change sampleNames in RGSet. Proceed ONLY if all is TRUE. ## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
sampleNames(AMD_ExtendedRGSet) <- AMD_samplesheet$Sample_Name #Change sampleNames of AMD_ExtendedRGSet.
identical(sampleNames(AMD_ExtendedRGSet), rownames(pData(AMD_ExtendedRGSet))) #Confirm sample orders in pData.## [1] TRUE
identical(sampleNames(AMD_ExtendedRGSet), colnames(getBeta(AMD_ExtendedRGSet))) #Confirm sample orders in beta matrix. ## [1] TRUE
save(AMD_ExtendedRGSet, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_ExtendedRGSet.RData")#Create ewastools sampleInfo sheet.
AMD_sampleInfo <- AMD_meta[,c("Sample_Name", "Disease_state", "Assay_name", "Sex", "Age", "Tissue")]
AMD_sampleInfo$Assay_name <- gsub("_Red", "", gsub("_Grn", "", AMD_sampleInfo$Assay_name))
AMD_sampleInfo <- unique(AMD_sampleInfo) #Remove duplicates.
path <- "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/"
AMD_sampleInfo$Assay_name <- paste0(path, AMD_sampleInfo$Assay_name)
meth <- read_idats(AMD_sampleInfo$Assay_name, quiet = TRUE)## [1] 622399
rm(AMD_meta) #Remove original meta file. save(AMD_sampleInfo, meth, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_ewastools_object.RData")load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_ExtendedRGSet.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_ewastools_object.RData")
AMD_samplesheet <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/AMD_samplesheet.csv", header = TRUE)It is worth checking the quality of the samples to see if they are true signals and are not conflated with background noise. One way to do so is by determining the detection p-values of the samples to parse out true methylation signals. Detection p-values by definition are measures that differentiate sample signal from background noise (which is estimated using the negative probes of the array). By default, the threshold for significant detection p-value is set at 0.01. Samples that are above the p-value threshold are considered statistically poor and should be removed (i.e. samples with high detection p-value should be discarded because they were detected with low signal-to-noise ratio of fluorescence intensities).
The minfi package provides a function, detectionP() to determine detection p-values for each methylation region across samples using the negative control probes in the array that are designed to NOT target the human genome. The probe sequences are propriety and are suggested to feature very low intensities (Heiss & Just (2019)).
NOTE: No samples above detection p-value of 0.01. All samples passed.
detp_minfi <- minfi::detectionP(AMD_ExtendedRGSet) #Detection p-value distinguishes signal from background noise with a single cut-off.
head(detp_minfi)[,1:5]## Sample 1 Sample 10 Sample 11 Sample 12 Sample 13
## cg00050873 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## cg00212031 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## cg00213748 6.491336e-46 7.633429e-54 3.984433e-32 4.143616e-56 1.782518e-25
## cg00214611 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## cg00455876 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## cg01707559 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
#Examine mean detection p-values across all samples to identify any failed samples.
plot(colMeans(detp_minfi), ylim = c(0.00002, 0.05), xaxt = 'n', ann = FALSE, pch = 20, col = "black", cex = 1) +
mtext(side = 1, line = 0.5, "Samples", font = 1, cex = 1) +
mtext(side = 2, line = 2, "Mean detection p-values", font = 1, cex = 1) +
abline(h = 0.01, col = "red") +
text(colMeans(detp_minfi), labels = AMD_ExtendedRGSet$Sample_Name, cex = 0.5, font = 2, pos = 2)## integer(0)
Heiss and Just recommends using the non-specific fluorescence intensities to estimate the background signal, which is a more accurate and stringent method without needing to set an extreme detection p-value cut-off. They imply that this method not only “protect against false-positive findings” but “against false-negative findings as well”. They show that their method calls almost all Y-chromosome probes among males, but classifies most Y-chromosome probes in females as undetected using the 0.01 cut-off, which is the intended outcome as females possess XX chromosomes and should not have any signal detected in Y-chromosomes. They also show similar results detected using the negative control probes, but with a more extreme cut-off and more samples filtered (> 1e-40).
NOTE: No samples above detection p-value of 0.01. All samples passed.
detp_ewastools <- ewastools::detectionP(meth) #Detection p-value distinguishes signal from background noise with a single cut-off.
detp <- as.data.frame(detp_ewastools$detP)
head(detp)[,1:5]## V1 V2 V3 V4 V5
## 1 5.361168e-88 5.204288e-92 7.375519e-131 4.212671e-123 1.016425e-37
## 2 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## 3 7.482801e-20 7.280083e-19 6.448259e-27 5.011001e-19 1.696454e-08
## 4 6.928902e-36 1.822559e-27 1.463483e-40 2.730796e-38 1.150689e-15
## 5 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## 6 0.000000e+00 4.857922e-250 0.000000e+00 0.000000e+00 0.000000e+00
colnames(detp) <- colnames(getBeta(AMD_ExtendedRGSet))
rownames(detp) <- meth$manifest$probe_id
#Examine mean detection p-values across all samples to identify any failed samples.
plot(colMeans(detp, na.rm = TRUE), ylim = c(0.00002, 0.05), xaxt = 'n', ann = FALSE, pch = 20, col = "black", cex = 1) +
mtext(side = 1, line = 0.5, "Samples", font = 1, cex = 1) +
mtext(side = 2, line = 2, "Mean detection p-values", font = 1, cex = 1) +
abline(h = 0.01, col = "red") +
text(colMeans(detp, na.rm = TRUE), labels = colnames(detp), cex = 0.5, font = 2, pos = 2)## integer(0)
For each probe sequence in the 450K array, a median of 14 beads is randomly distributed on the array. Each of these beads contains hundreds of thousands of oligonucleotides. This provides a unique set of internal technical replication on each array. This step is done to remove probes that are not represented by a minimum of 3 beads on the array, which is important to correct for positional effects (i.e. the effects where the same sample in different physical positions on the array could be measured as different methylation levels).
Probes that are not represented by a minimum of 3 beads on the array are designated as NA with the beadcount() function.
NOTE: All samples passed.
#Calculate the number of samples with bead count <3 for each probe in a matrix of bead count values.
bead <- beadcount(AMD_ExtendedRGSet)
colnames(bead) <- gsub('X', '', colnames(bead))
AMD_samplesheet <- AMD_samplesheet %>% mutate(Beadcount = colSums(is.na(bead)))
AMD_samplesheet %>%
mutate(Sample_Name = factor(as.character(Sample_Name), levels = Sample_Name)) %>%
ggplot(aes(x = Sample_Name, y = Beadcount)) +
geom_point(alpha = 0.7, color = 'black') +
geom_hline(yintercept = 0.01*nrow(bead), linetype = 'dashed', color = 'green') +
geom_text(aes(x = 0, y = 0.01*nrow(bead)),
label = '1%', vjust = -0.5, hjust = -0.5, color = 'green')+
scale_y_continuous(limits = c(0, 12500), breaks = seq(0, 12500, 2500)) +
labs(x = 'Samples', y = '', title = '# Samples with probes with bead count < 3') +
theme_classic() +
theme(axis.text.x = element_blank()) save(detp_minfi, bead, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_detp_beadcount_QC.RData")minfi package provides a simple quality control plot that uses the log median intensity in both the methylated (M) and unmethylated (U) channels. When plotting these two medians against each other, it has been observed that good samples cluster together with higher median intensities, while failed samples tend to separate and have lower median intensities.
NOTE: All samples passed.
AMD_MSet <- preprocessRaw(AMD_ExtendedRGSet) #Get MSet object from RGSet.
head(getMeth(AMD_MSet)[,1:3])## Sample 1 Sample 10 Sample 11
## cg00050873 13702 13026 14553
## cg00212031 459 753 391
## cg00213748 1442 1763 1505
## cg00214611 224 434 119
## cg00455876 3911 4834 5463
## cg01707559 795 802 759
head(getUnmeth(AMD_MSet)[,1:3])## Sample 1 Sample 10 Sample 11
## cg00050873 1821 2141 2001
## cg00212031 5111 5494 5486
## cg00213748 257 153 311
## cg00214611 5876 6057 5677
## cg00455876 1178 1320 1431
## cg01707559 7699 8679 8710
AMD_QC <- getQC(AMD_MSet)
head(AMD_QC)## DataFrame with 6 rows and 2 columns
## mMed uMed
## <numeric> <numeric>
## Sample 1 10.6375305515253 11.3415188232452
## Sample 10 11.0188956211216 11.4361909954814
## Sample 11 11.1478408936723 11.5401280385821
## Sample 12 10.8811139606751 11.4782640315817
## Sample 13 10.8049376720519 11.2801907922169
## Sample 14 10.9708249013626 11.3426302986784
plotQC(AMD_QC)Quality control metrics are examined to determine the success of the bisulphite conversion and subsequent array hybridisation. This check uses the Illumina’s 636 control probes to assess technical parameters including array staining, extension, hybridization, target removal, specificity, and bisulfite conversion.
NOTE: All samples passed control metrics.
ctrls <- control_metrics(meth)
# A logical vector of passed/failed is returned by sample_failure() which compares all 17 metrics against the thresholds recommended by Illumina.
AMD_sampleInfo$failed <- as.data.frame(sample_failure(ctrls))
table(AMD_sampleInfo$failed) #If AMD_sampleInfo$failed == FALSE, all samples PASS.##
## FALSE
## 44
failed_control_metrics <- AMD_sampleInfo[AMD_sampleInfo$failed == TRUE, "Sample_Name"] #Check the sample names for the ones that fail control metrics.
failed_control_metrics #No failed samples. ## factor(0)
## 44 Levels: Sample 1 Sample 10 Sample 11 Sample 12 Sample 13 ... Sample 9
#Summary of control metrics.
control_metrics_all <- as.data.frame(ctrls)
control_metrics_all$Sample_Name <- AMD_sampleInfo$Sample_Name
head(control_metrics_all)## Restoration Staining.Green Staining.Red Extension.Green Extension.Red
## 1 0.03074534 165.6038 18.81818 62.96364 13.23746
## 2 0.04477612 128.6500 NA 59.72438 13.03802
## 3 0.04687976 NA NA 65.51228 12.58561
## 4 0.04408138 110.1176 259.83333 58.45902 13.48349
## 5 0.05071562 761.2381 224.86667 76.97196 11.92087
## 6 0.06167536 896.2500 NA 63.78764 11.68353
## Hybridization.High.Medium Hybridization.Medium.Low Target.Removal.1
## 1 1.680178 2.077765 17.88889
## 2 1.663601 2.134143 13.07968
## 3 1.581086 2.200758 12.78210
## 4 1.577832 2.034192 13.29508
## 5 1.593910 2.112461 15.30476
## 6 1.564947 2.074236 12.68093
## Target.Removal.2 Bisulfite.Conversion.I.Green Bisulfite.Conversion.I.Red
## 1 13.089431 19.43561 7.605697
## 2 12.388679 19.55769 6.784644
## 3 9.690265 21.18272 6.929114
## 4 11.544484 21.24806 8.676753
## 5 10.713333 22.11200 10.745140
## 6 8.377892 19.78571 6.975317
## Bisulfite.Conversion.II Specificity.I.Green Specificity.I.Red Specificity.II
## 1 9.193029 10.068966 4.773639 27.63091
## 2 10.328638 10.681818 4.826513 28.79827
## 3 11.394419 12.701695 4.674271 27.03368
## 4 15.401290 9.255245 3.654054 32.10127
## 5 5.925329 11.415385 3.245431 25.92169
## 6 10.635226 7.774929 3.172968 27.81843
## Non.polymorphic.Green Non.polymorphic.Red Sample_Name
## 1 8.395745 9.221387 Sample 1
## 2 9.231855 10.183274 Sample 10
## 3 10.406780 10.298646 Sample 11
## 4 8.893971 10.983957 Sample 12
## 5 9.436782 9.711957 Sample 13
## 6 8.233456 9.702459 Sample 14
sapply(control_metrics_all, function(x) sum(is.na(x)))## Restoration Staining.Green
## 0 15
## Staining.Red Extension.Green
## 25 0
## Extension.Red Hybridization.High.Medium
## 0 0
## Hybridization.Medium.Low Target.Removal.1
## 0 0
## Target.Removal.2 Bisulfite.Conversion.I.Green
## 0 0
## Bisulfite.Conversion.I.Red Bisulfite.Conversion.II
## 0 0
## Specificity.I.Green Specificity.I.Red
## 0 0
## Specificity.II Non.polymorphic.Green
## 0 0
## Non.polymorphic.Red Sample_Name
## 0 0
stripchart(ctrls$`Bisulfite Conversion II`, method = "jitter", pch = 4, xlab = 'Bisulfite Conversion II', xlim = c(0,20)) +
abline(v = 1, col = 2, lty = 3) +
text(ctrls$`Bisulfite Conversion II`[ctrls$`Bisulfite Conversion II` < 1], 1.2, labels = ctrls$Sample_Name[ctrls$`Bisulfite Conversion II` < 1], srt = 45)## integer(0)
or
controlStripPlot(AMD_ExtendedRGSet, controls = c("BISULFITE CONVERSION I", "BISULFITE CONVERSION II"))The 450K BeadChip also features 65 control probes which assay highly-polymorphic single nucleotide polymorphisms (SNPs) rather than DNA methylation. These are included on the array to allow sample quality control to check for relatedness between individuals and enable the detection of potential sample mix-ups. The signal from these probes is expected to cluster into three distinct groups (representing the heterozygous and two homozygous groups). The snp_outliers function however computes the average log odds from the 65 posterior probabilities from a mixture model to capture how irregular the SNP betas are, i.e. how much they deviate from the ideal trimodal distribution. Although these are not DNA methylation signals, they could be used to provide an indication of the degree of technical variance between samples.
NOTE: All samples passed.
#While ewastools implements the LOESS normalization (Heiss and Brenner, 2015), the developers of the package says not use the normalization "as it does little to protect against batch effects but can result in the removal of genuine biological signal". They recommend to adjust for relevant technical covariates in regression models later.
beta <- dont_normalize(meth)
#Pulling SNP probes.
snps <- meth$manifest[probe_type == "rs", index]
snps <- beta[snps,]
#These SNPs are then used as input for call_genotypes(). This function estimates the parameters of a mixed model consisting of three beta distributions representing one heterozygous and the two homozygous genotypes. There is also a fourth component, shown as a uniform distribution that represents outliers. The functions returns posterior probabilities used for soft classification.
#In simple words, we are determining the probability for every SNP whether they belong to 1 of 4 different distributions - 3 of which correspond to the 3 expected genotypes (AA, AB, BB), and the 4th distribution corresponds to outside/in-between these expected genotype distributions. SNPs with a higher probability of belonging to this 4th distribution indicate mixing between more than one genotype.
#Fit mixed model to call genotypes.
genotypes_called <- call_genotypes(snps, learn = T)
#Call genotype clusters.
AMD_samplesheet <- AMD_samplesheet %>%
mutate(genotype_cluster = as.factor(enumerate_sample_donors(genotypes_called)))
#Examine probability outlier.
plot(snp_outliers(genotypes_called) %>% sort, ylab = "SNP Outliers")#Overall distribution of the genotypes.
ewastools:::mxm_(genotypes_called)We see 3 peaks, corresponding to 3 possible genotypes.
#Check the average probability of SNP not belonging to any of the 3 genotypes (coloured by Sex).
AMD_samplesheet <- AMD_samplesheet %>%
mutate(Prob_SNP_outlier = colMeans(genotypes_called$outliers, na.rm = T),
Prob_SNP_outlier_Logodds = snp_outliers(genotypes_called))
ggplot(AMD_samplesheet, aes(x = Sample_Name, y = Prob_SNP_outlier, fill = Sex)) +
geom_point(shape = 21, size = 2.5, alpha = 0.8, col = "black") +
scale_fill_manual(values = c("#bd7b9f", "#2c7dab")) +
labs(x = 'Samples', y = "Probability",
title = 'Average probability of SNP being an outlier') +
theme_bw() +
theme(axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 15, vjust = -0.3),
axis.title.y = element_text(size = 15, vjust = 2),
legend.position = "none") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.1)) +
scale_x_discrete(breaks = NULL, expand = c(0.02, 0.02))The Y-axis denotes the average probability of SNP being an outlier. No sample appears to be > 0.1 probability, indicating that there is minimal probability of the presence of outlier.
#Look at the raw distribution
snp_betas <- getSnpBeta(AMD_ExtendedRGSet)
snp_betas_melt <- t(snp_betas) %>% as_tibble %>% mutate(Sample_Name = colnames(snp_betas)) %>%
left_join(AMD_samplesheet %>% select(Sample_Name, Sex), by = 'Sample_Name') %>%
gather(key = 'SNP', value = 'Beta', -Sample_Name, -Sex)## Warning: Column `Sample_Name` joining character vector and factor, coercing into
## character vector
ggplot(snp_betas_melt, aes(x = SNP, y = Beta, fill = Sex)) +
geom_point(shape = 21, size = 2.5, alpha = 0.8, col = "black") +
scale_fill_manual(values = c("#bd7b9f", "#2c7dab")) +
labs(x = '59 SNPs') + theme_bw() + theme(axis.text.x = element_blank()) +
theme_bw() +
theme(axis.text.x = element_blank()) +
scale_x_discrete(breaks = NULL, expand = c(0.02, 0.02))omicsPrint (Van Iterson et al. 2018) is a package developed to detect data linkage errors through inspecting sample relations in multiple omics studies. Included with the package is the hm450.manifest.pop.GoNL data, which stores SNP probe information in a GRanges class object. This is then used to create a subset of the beta values for genotyping. The function beta2genotype() then genotypes the observations by measuring homozygous or heterozygous alleles at these SNP probes. Lastly alleleSharing() assesses the relationships between different individuals, which can be unrelated, twins, or identical. The results can then be visualised using the inferRelations() function. In the data with sample relationships, this would be shown in the above graph as green or black clusters (Van Iterson et al. 2018). It is important to carry out this type of visualization before probe-filtering as otherwise the genotyping will be based on very few SNPs.
NOTE: No mismatches found.
data(hm450.manifest.pop.GoNL)
betas <- getBeta(AMD_ExtendedRGSet)
cpgs <- names(hm450.manifest.pop.GoNL[mcols(hm450.manifest.pop.GoNL)$MASK.snp5.EAS])
cpgs <- na.omit(match(cpgs, rownames(betas)))
omicsBetas <- betas[cpgs,]
omicsBetas[1:10, 1:2]## Sample 1 Sample 10
## cg08477687 0.15064562 0.57708628
## cg00645010 0.05583976 0.06016167
## cg11422233 0.07789995 0.07780564
## cg06402284 0.04116505 0.07233649
## cg01551879 0.04928504 0.05166097
## cg20788133 0.66539440 0.68896612
## cg09139287 0.49239130 0.31366692
## cg23100540 0.17080388 0.13082796
## cg23999112 0.70067114 0.74516195
## cg01062849 0.82885375 0.78417470
dnamCalls <- beta2genotype(omicsBetas, assayName = "exprs")
dim(dnamCalls)## [1] 597 44
dnamCalls[1:10, 1:2]## Sample 1 Sample 10
## cg19405842 2 1
## cg01296877 3 1
## cg21783012 1 2
## cg26422465 1 2
## cg00345083 3 2
## cg15075357 2 3
## cg18285337 2 3
## cg02890259 2 3
## cg15600437 2 2
## cg25593194 3 2
omicsData <- alleleSharing(dnamCalls, verbose = TRUE)## Hash relations
## Pruning 597 SNPs ...
## 0 SNPs removed because of low call rate!
## 0 samples removed because too few SNPs called!
## Using 597 polymorphic SNPs to determine allele sharing.
## Running `square` IBS algorithm!
## 45 of 990 (4.55%) ...
mismatches <- inferRelations(omicsData)dim(mismatches)## [1] 0 6
Hierarchical clustergram across all samples cluster similar samples together while samples that are different from all the other samples are pulled down as outliers.
NOTE: No obvious sample that is pulled down as outlier.
#Use SNP probes to infer identity and see how they cluster.
snp_betas <- getSnpBeta(AMD_ExtendedRGSet)
identity_dendo <- dist(t(snp_betas))
clust <- hclust(identity_dendo)
dendo <- as.dendrogram(clust)
dendo %>% dendextend::set("labels_cex", 0.6) %>%
hang.dendrogram %>% plot()The outlyx function takes any beta matrix (preferably raw) and will identify any samples that are inconsistent with the rest of the data. From the plot, we can observe that any data points that fall into the red squares are indeed outlying and should be removed from analysis.
To confirm the previous identity dendrogram with no obvious outliers, I run the chunk below to see if there are any sample that falls within the two red squares in the plot.
NOTE: No samples fall into the red squares, so no outlier confirmed.
betas <- getBeta(AMD_ExtendedRGSet)
detout <- outlyx(betas)detout$Sample_Name <- rownames(detout)
detout[which(detout$outliers == T),]## [1] iqr mv outliers Sample_Name
## <0 rows> (or 0-length row.names)
minfi has a sex predictor function that uses the median values of measurements on the X and Y chromosomes respectively. If yMed - xMed is less than cutoff of -2, the sample is predicted as female; otherwise it is predicted as male.
#Sex prediction using minfi's getSex() function.
AMD_gRSet <- mapToGenome(AMD_MSet) #Convert to GenomicMethylSet object.
predSex <- getSex(AMD_gRSet)
head(predSex)## DataFrame with 6 rows and 3 columns
## xMed yMed predictedSex
## <numeric> <numeric> <character>
## Sample 1 12.08015127114 12.2494097548417 M
## Sample 10 12.2550285396274 12.4141363144996 M
## Sample 11 12.3826240265749 12.5400006922142 M
## Sample 12 12.2884335395868 12.4617801274941 M
## Sample 13 12.1611318829843 12.3721820668771 M
## Sample 14 12.8814007851121 8.92777492058433 F
#Compare to Sex from metadata.
AMD_samplesheet$Sex <- as.factor(AMD_samplesheet$Sex)
predSex$predictedSex <- as.factor(predSex$predictedSex)
all.equal(AMD_samplesheet$Sex, predSex$predictedSex) #TRUE.## [1] TRUE
#Plot predicted sex against reported sex from metadata.
predictedSex <- as.data.frame(predSex)
ggplot(predictedSex, aes(x = xMed, y = yMed, fill = AMD_samplesheet$Sex)) +
geom_point(shape = 21, size = 3, alpha = 0.8, col = "black") +
scale_fill_manual(values = c("#bd7b9f", "#2c7dab")) +
theme_classic() +
theme(axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 15, vjust = -0.3),
axis.title.y = element_text(size = 15, vjust = 2),
legend.position = "none")The sex of the samples cluster accordingly.
ewastools has a similar sex predictor function that computes for each sample the average total intensities of all probes targeting either chromosome, X and Y respectively (There are 11,232 probes that target the X chromosome and 413 probes that target the Y chromosome). This function exploits the natural difference in allosomal (sex) copy number (with females having more copy number than males) and the fact that total intensity (U + M) is sensitive to copy number variation to detect sex mismatches. The threshold to discriminate between both sexes is determined by the Hodges-Lehmann estimator (i.e. median of all pairwise male/female averages) for X and Y chromosomes separately. The dotted lines in the figure below represent the Hodges-Lehman estimators separating the male and female cluster centres. The male samples should cluster in the top left quadrant while the female samples should cluster in the bottom right quadrant. The samples that fall in the top right and bottom left quadrants are considered “unclear” (Heiss & Just 2018).
It is suggested that this approach is more robust than minfi’s getSex() function due to its potential to detect sex mismatches and allosomal outliers.
#Sex prediction using ewastools' check_sex() function.
predicted_sex <- check_sex(meth)
AMD_samplesheet <- AMD_samplesheet %>% mutate(normalized_X_intensity = predicted_sex$X,
normalized_Y_intensity = predicted_sex$Y)
#Sex plot.
ggplot(AMD_samplesheet, aes(x = normalized_X_intensity, y = normalized_Y_intensity, fill = Sex)) +
geom_point(shape = 21, size = 3, alpha = 0.8, col = "black") + theme_classic() +
scale_fill_manual(values = c("#bd7b9f", "#2c7dab")) +
geom_text_repel(data = AMD_samplesheet %>% filter(Sex == 'M', normalized_X_intensity > 0.95,
normalized_Y_intensity < 0.5),
aes(label = Sample_Name), size = 3, force = 15, nudge_x = -0.1, nudge_y = -0.1) +
geom_hline(yintercept = 0.5, linetype = 'dashed', col = '#bd7b9f') +
geom_vline(xintercept = 0.95, linetype = 'dashed', col = '#2c7dab') +
theme(axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14),
axis.title.x = element_text(size = 15, vjust = -0.3),
axis.title.y = element_text(size = 15, vjust = 2),
legend.position = "none")This second check confirms no sex mismatches in the samples.
Code from Nicole Gladish, Rachel Edgar and Sumaiya Islam.
Location: ~/KoborLab/kobor_space/shared_coding_resource/PCA Code.Rmd
betas <- getBeta(AMD_ExtendedRGSet) #Essentially use your mvalue matrix.
meta <- pData(AMD_ExtendedRGSet)
PCA_full <- princomp(na.omit(betas)) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))
#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.
colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"
meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Sample_Group <- as.factor(meta$Sample_Group)
meta$Age <- as.numeric(meta$Age)
colnames(meta) #Obtain the column numbers to include.## [1] "Sample_Name" "Sample_Group" "Sex" "Age"
## [5] "Tissue" "Sentrix_Position" "Sentrix_ID" "Basename"
## [9] "filenames"
meta_categorical <- data.frame(meta[, c(2,3,6,7)]) #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[,4]) #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.## [1] "Sample_Group" "Sex" "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease state", "Sex", "Row", "Chip")
colnames(meta_continuous)## [1] "meta...4."
colnames(meta_continuous) <- c("Age")
Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)AMD_MSet <- preprocessRaw(AMD_ExtendedRGSet) #Get MSet object from RGSet.
plotBetasByType(AMD_MSet[,1])#Perform background correction.
AMD_MSet.illumina <- preprocessIllumina(AMD_ExtendedRGSet, bg.correct = TRUE,
normalize = "control")
#Plot to visualise post-normalisation distribution.
plotBetasByType(AMD_MSet.illumina[,1])Subset-quantile within array normalization (SWAN) (Jovana Maksimovic, Lavinia Gordon, and Alicia Oshlack 2012) is a within-array normalization correction for the technical differences between the Type I and Type II array designs. The algorithm matches the beta-value distributions of the Type I and Type II probes by applying a within-array quantile normalization separately for different subsets of probes (divided by CpG content).
AMD_MSet <- preprocessRaw(AMD_ExtendedRGSet) #Get MSet object from RGSet.
#SWAN uses a random subset of probes to do the between array normalization. In order to achive reproducible results, the seed needs to be set using set.seed.
set.seed(100)
#Perform SWAN normalisation.
AMD_MSet.swan <- preprocessSWAN(AMD_ExtendedRGSet, mSet = AMD_MSet)
#Plot to visualise post-normalisation distribution.
plotBetasByType(AMD_MSet.swan[,1])save(AMD_MSet.swan, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_MSet.swan.RData")This function implements stratified quantile normalization preprocessing. The normalization procedure is applied to the Meth and Unmeth intensities separately. The distribution of type I and type II signals is forced to be the same by first quantile normalizing the type II probes across samples and then interpolating a reference distribution to which we normalize the type I probes. Since probe types and probe regions are confounded and we know that DNAm distributions vary across regions we stratify the probes by region before applying this interpolation. Note that this algorithm relies on the assumptions necessary for quantile normalization to be applicable and thus is not recommended for cases where global changes are expected such as in cancer-normal comparisons.
The different arguments for the function can be summarized into the following list:
If fixOutliers is TRUE, the functions fixes outliers of both the methylated and unmethylated channels when small intensities are close to zero.
If removeBadSamples is TRUE, it removes bad samples using the QC criterion discussed previously.
Performs stratified subset quantile normalization if quantileNormalize = TRUE and stratified = TRUE.
Predicts the sex (if not provided in the sex argument) using the function getSex and normalizes males and females separately for the probes on the X and Y chromosomes.
#Perform quantile normalisation.
AMD_gRSet.quantile <- preprocessQuantile(AMD_ExtendedRGSet,
fixOutliers = TRUE,
removeBadSamples = TRUE, badSampleCutoff = 10.5,
quantileNormalize = TRUE, stratified = TRUE,
mergeManifest = FALSE, sex = NULL)## [preprocessQuantile] Mapping to genome.
## [preprocessQuantile] Fixing outliers.
## [preprocessQuantile] Quantile normalizing.
#Get manifest information for plot.
probeTypes <- data.frame(Name = featureNames(AMD_gRSet.quantile),
Type = getProbeType(AMD_gRSet.quantile))
#Plot to visualise post-normalisation distribution.
betas.quantile <- getBeta(AMD_gRSet.quantile)
plotBetasByType(betas.quantile[,1], probeTypes = probeTypes)#Perform noob normalisation.
AMD_MSet.noob <- preprocessNoob(AMD_ExtendedRGSet)
#Plot to visualise post-normalisation distribution.
plotBetasByType(AMD_MSet.noob[,1])save(AMD_MSet.noob, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_MSet.noob.RData")BMIQ normalisation uses quantiles to normalise the Type II probe values into a distribution comparable to the Type I probes using a beta-mixture model fit ot the Type I and Type II probes separately and then transforms the probabilities of class membership of the Type II probes into quantiles of beta values using the parameters of the beta-distributions of the Type I distribution. This method uses a three-state beta-mixture model but does not use fit to the middle ‘hemi-methylated’ component in the normalisation, therefore it does not require a trimodal distribution. The advantage of BMIQ is that it avoids selecting subsets of probes matched for biological characteristics as done in SWAN and quantile normalisation, and was found to be the best algorithm for reducing probe design bias. However, it is also a more aggressive method of normalisation, so in the attempt to make the Type II probe values more similar to the Type I probe values, it may have inadvertently remove biologically meaningful values from the distributions.
#Perform BMIQ normalisation.
betas.BMIQnoob <- BMIQ(AMD_MSet.noob, nfit = 100000)
#Save BMIQ object.
save(betas.BMIQnoob, file= "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/betas.BMIQnoob.RData")load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/betas.BMIQnoob.RData")
#Plot to visualise post-normalisation distribution.
#Download 450K manifest from shared coding space.
load("~/KoborLab/kobor_space/shared_coding_resource/HM450_fdat.RData")
dim(fData_450)## [1] 485577 37
#Check if the CpGs correspond with the CpGs from the manifest.
manifest <- fData_450[rownames(fData_450)%in%rownames(betas.BMIQnoob),]
dim(manifest)## [1] 485512 37
#Subset data based on probe types - Either Type 1 or Type 2.
probe_types <- fData_450$INFINIUM_DESIGN_TYPE
#Subset Type 1 probe data.
type_1 <- subset(fData_450, fData_450$INFINIUM_DESIGN_TYPE == "I")
dim(type_1)## [1] 135501 37
#See how many Type 1 CpGs correspond with the Type 1 probe data.
type_1betas <- betas.BMIQnoob[rownames(betas.BMIQnoob)%in%rownames(type_1),]
dim(type_1betas)## [1] 135476 44
#Subset Type 2 probe data.
type_2 <- subset(fData_450, fData_450$INFINIUM_DESIGN_TYPE == "II")
dim(type_2)## [1] 350076 37
#See how many Type 2 CpGs correspond with the Type 2 probe data.
type_2betas <- betas.BMIQnoob[rownames(betas.BMIQnoob)%in%rownames(type_2),]
dim(type_2betas)## [1] 350036 44
#Plot BMIQ normalised data.
plot(c(0,1), c(0,10), xlab = "Beta values", ylab = "Density", main = "")
lines(density(na.omit(type_1betas)), col = "red", lty = 2, lwd = 2)
lines(density(na.omit(type_2betas)), col = "blue", lty = 1, lwd = 2)
legend("top", inset = 0.05, cex = 1.0, c("Type 1", "Type 2"), col = c("red", "blue"), lty = c(2,1), horiz = FALSE)The function preprocessFunnorm implements the functional normalization algorithm developed in Jean-Philippe Fortin et al. 2014. Briefly, it uses the internal control probes present on the array to infer between-array technical variation. It is particularly useful for studies comparing conditions with known large-scale differences, such as cancer/normal studies, or between-tissue studies. It has been shown that for such studies, functional normalization outperforms other existing approaches (Jean-Philippe Fortin et al. 2014). By default, the function applies the preprocessNoob function as a first step for background substraction, and uses the first two principal components of the control probes to infer the unwanted variation.
#Perform funnorm normalisation.
AMD_gRSet.funnorm <- preprocessFunnorm(AMD_ExtendedRGSet)## [preprocessFunnorm] Background and dye bias correction with noob
## [preprocessFunnorm] Mapping to genome
## [preprocessFunnorm] Quantile extraction
## [preprocessFunnorm] Normalization
#Get manifest information for plot.
probeTypes <- data.frame(Name = featureNames(AMD_gRSet.funnorm),
Type = getProbeType(AMD_gRSet.funnorm))
#Plot to visualise post-normalisation distribution.
betas.funnorm <- getBeta(AMD_gRSet.funnorm)
plotBetasByType(betas.funnorm[,1], probeTypes = probeTypes)save(AMD_gRSet.funnorm, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.RData")Here, I am performing PCA on funnorm normalised betas.
#Load the normalised beta matrix of choice.
betas <- betas.funnorm
meta <- pData(AMD_gRSet.funnorm)
PCA_full <- princomp(betas) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))
#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.
colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"
meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Sample_Group <- as.factor(meta$Sample_Group)
meta$Age <- as.numeric(meta$Age)
colnames(meta) #Obtain the column numbers to include.## [1] "Sample_Name" "Sample_Group" "Sex" "Age"
## [5] "Tissue" "Sentrix_Position" "Sentrix_ID" "Basename"
## [9] "filenames" "xMed" "yMed" "predictedSex"
meta_categorical <- data.frame(meta[, c(2,3,6,7)]) #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[,4]) #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.## [1] "Sample_Group" "Sex" "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease state", "Sex", "Row", "Chip")
colnames(meta_continuous)## [1] "meta...4."
colnames(meta_continuous) <- c("Age")
Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)#Get raw betas to compare.
betas.raw <- getBeta(AMD_ExtendedRGSet)
#Check sample order for sample-sample correlation.
identical(rownames(betas.raw), rownames(betas.funnorm)) #FALSE.## [1] FALSE
identical(colnames(betas.raw), colnames(betas.funnorm)) #TRUE.## [1] TRUE
#Reorder probe order.
betas.raw <- betas.raw[order(rownames(betas.raw)),]
betas.funnorm <- betas.funnorm[order(rownames(betas.funnorm)),]
#Recheck sample order for sample-sample correlation.
identical(rownames(betas.raw), rownames(betas.funnorm)) #TRUE.## [1] TRUE
identical(colnames(betas.raw), colnames(betas.funnorm)) #TRUE.## [1] TRUE
#Sample-sample correlation.
cor.raw_funnorm <- cor(betas.raw, betas.funnorm, use = "pairwise.complete.obs") #Input: Beta values to compare.
plot(xaxt = "n", ylim = c(0.95, 1), xlab = "", ylab = "", diag(cor.raw_funnorm), main = "Sample-sample correlation", pch = 16, panel.first = grid()) +
axis(1, 1:44, labels = rownames(cor.raw_funnorm), las = 3, cex.axis = 0.8)## numeric(0)
#Difference in beta values before and after normalisation.
diff = betas.funnorm - betas.raw
hist(diff, cex.axis = 0.8, breaks = 100, xlab = "", ylab = "", xlim = c(-0.8, 0.8), las = 1, main = "Difference in beta values before and after normalisation")Probes that target the sex chromosomes are removed as both males and females have unequal amount X chromosomes - with females having twice the amount of X chromosomes as compared to males. So they need to be removed so that the analysis is not skewed on the basis of this disproportionate natural difference in chromosomal number.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.RData")
data(IlluminaHumanMethylation450kmanifest)
data(Locations)
#Check probe count before removal.
dim(AMD_gRSet.funnorm) #485512 probes.## [1] 485512 44
#Get sex probes information from manifest file.
sex_probes <- Locations[which(Locations$chr == "chrY" | Locations$chr == "chrX"), ] %>% as.data.frame()
dim(sex_probes) #19627 probes.## [1] 19627 3
#Check the number of sex probes in the beta matrix.
AMD_sex_probes <- AMD_gRSet.funnorm[which(rownames(AMD_gRSet.funnorm) %in% rownames(sex_probes)),]
dim(AMD_sex_probes) #10583 probes.## [1] 10583 44
#Remove sex probes from AMD beta matrix.
AMD_gRSet.funnorm <- AMD_gRSet.funnorm[!(rownames(AMD_gRSet.funnorm)%in%rownames(sex_probes)),]
dim(AMD_gRSet.funnorm) #474929 probes remaining after sex probes filtering. ## [1] 474929 44
Non-specific and cross-hybridising probes are filtered according to the probes identified by Chen et al. and Langmead & Salzberg. There are probes that target multiple sites in the human genome and/or pose potential hybridisation issues, and thus result in inaccurate methylation signals detection.
Cross-reactive probes target highly repetitive sequences or co-hybridise to alternate sequences that are highly homologous to the intended targets, which could lead to the detection of spurious signals and potentially resulting in invalud conclusions and lack of validation in downstream analyses (Chen et al. 2013). Chen et al. (2013) proposes using the minimum number of bases matched to unintended targets of 47 bases for Infinium I and II probes to be used as criteria to identify cross-reactivity. They identified 8.4% of the Infinium I probes and 5.1% of the Infinium II probes (total: 6.0%) to be cross-reactive.
#Load Price annotation.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/fData_450_Price.RData")
#XY cross-hybridising probes.
XY_cross_hyb <- fData_450_Price[fData_450_Price$XY_Hits == "XY_YES",]$ILMNID
length(XY_cross_hyb) #12388 XY cross-hybridising probes.## [1] 12388
AMD_gRSet.funnorm <- AMD_gRSet.funnorm[!(rownames(AMD_gRSet.funnorm) %in% XY_cross_hyb),]
dim(AMD_gRSet.funnorm) #463426 probes after XY cross-hybridising probes removal.## [1] 463426 44
#Autosomal cross-hybridising probes.
auto_cross_hyb <- fData_450_Price[fData_450_Price$Autosomal_Hits == "A_YES",]$ILMNID
length(auto_cross_hyb) #40650 autosomal cross-hybridising probes.## [1] 40650
AMD_gRSet.funnorm <- AMD_gRSet.funnorm[!(rownames(AMD_gRSet.funnorm) %in% auto_cross_hyb),]
dim(AMD_gRSet.funnorm) #434216 probes after autosomal cross-hybridising probes removal.## [1] 434216 44
#Load cross-hybridising and multi-mapped probes.
#cross.react <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Non_specific_probes_450K.csv", header = TRUE, as.is = TRUE)
#cross.react.probes <- as.character(cross.react$TargetID)
#multi.map <- read.table("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Cross_hybridising_probes_450K.txt", header = FALSE, as.is = TRUE)
#multi.map.probes <- as.character(multi.map$V1)
#Check for overlap between the two probe sets.
#filter.probes <- unique(c(cross.react.probes, multi.map.probes))
#length(filter.probes) #38941 probes.
#Remove cross-hybridising and multi-mapped probes.
#table(rownames(betas.BMIQnoob) %in% filter.probes) #37760 cross-hybridising and multi-mapped probes in BMIQ-normalised beta matrix.
#betas.BMIQnoob <- betas.BMIQnoob[!(rownames(betas.BMIQnoob) %in% filter.probes),]
#dim(betas.BMIQnoob) #437169 probes remaining after cross-hybridising and multi-mapped probes are filtered.Polymorphic probes are probes that target CpG sites that overlap known SNPs. The methylation levels detected for such CpGs can be greatly influenced by the underlying genetic polymorphism - thus they should be interpreted with caution (Chen et al. 2013). By cross-matching the genomic positions of both C and G of all array-targeted CpGs and the position of single base extension (Infinium I) to that of known SNPs in the 1,000 Genome database, we found 9.4% of the Infinium I probes and 15.5% of the Infinium II probes (total probes: 13.8%) to have methylation levels that could potentially be affected by genetic polymorphism. Caveat: Although it is shown that methylation profile from the Illumina HM450 microarray could be greatly affected by genetic polymorphism, the majority of the SNPs are rare with very low alternative allele frequencies, thus they would not be expected to have a major effect on methylation data when the population under study does not demonstrate a significant frequency of the rare allele. Studies that are focused on intraindividual differences rather than interindividual differences (such as tumour/normal tisssue differences; longitudinal evaluation of methylation profiles; monozygotic twin studies) are not expected to be confounded by such underlying SNPs.
Pidsley et al. identify overlap with genetic variant categories with minor allele frequency > 5% at: (1) target CpG sites (n = 12,378); (2) single base extension sites of Type I probes (n = 772) using the EPIC array.
polymorphic_probes <- read.csv("~/KoborLab/kobor_space/kendrix/R_Functions/Pidsley_EPIC_polymorphic_probes.csv", header = TRUE)
polymorphic_probes <- as.character(polymorphic_probes$PROBE)
length(polymorphic_probes) #12510 polymorphic probes.## [1] 12510
polymorphic_base_ext <- read.csv("~/KoborLab/kobor_space/kendrix/R_Functions/Pidsley_EPIC_single_base_ext_overlapping_SNP.csv", header = TRUE)
polymorphic_base_ext <- as.character(polymorphic_base_ext$PROBE)
length(polymorphic_base_ext) #414 probes that overlap with SNP at single base extension sites.## [1] 414
#Get overlap between two probe sets.
polymorphic_probes_to_filter <- unique(c(polymorphic_probes, polymorphic_base_ext))
length(polymorphic_probes_to_filter) #12679 probes to filter. ## [1] 12679
#Remove polymorphic probes.
AMD_gRSet.funnorm <- AMD_gRSet.funnorm[!(rownames(AMD_gRSet.funnorm) %in% polymorphic_probes_to_filter),]
dim(AMD_gRSet.funnorm) #427786 probes after polymorphic probes removal.## [1] 427786 44
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_detp_beadcount_QC.RData")
AMD_samplesheet <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/AMD_samplesheet.csv", header = TRUE)
#Create a dummy matrix
bad_probes <- matrix(data = F, nrow = nrow(detp_minfi), ncol = ncol(detp_minfi),
dimnames = list(rownames(detp_minfi), colnames(detp_minfi))) %>% as.data.frame
#Designate TRUE to where beadcount < 3 and detection p-value > 0.01.
bad_probes[is.na(bead)] <- T
bad_probes[detp_minfi > 0.01] <- T
#Number of failed probes by samples.
AMD_samplesheet <- AMD_samplesheet %>% mutate(bad_probes = colSums(bad_probes))
#Check for samples that fail in more than 1% of total probes that have beadcount < 3 and detection p-value > 0.01.
AMD_samplesheet %>%
mutate(Sample_Name = factor(as.character(Sample_Name), levels = Sample_Name)) %>%
ggplot(aes(x = Sample_Name, y = bad_probes)) +
geom_point() +
labs(x = 'Samples', y = '# probes with bead count < 3 or detection p > 0.01') +
geom_hline(yintercept = 0.01*nrow(bad_probes), linetype = 'dashed', color = 'red') +
geom_text(aes(x = 0, y = 0.01*nrow(bead)), label = '1%', vjust = -0.5, hjust = -0.5, color = 'red') +
geom_label_repel(data = AMD_samplesheet %>% filter(bad_probes > 0.01*nrow(detp_minfi)),
aes(x = Sample_Name, y = bad_probes, label = Sample_Name),
force = 100, show.legend = F) +
theme_classic() + theme(axis.text.x = element_blank()) #Check for probes that are unsuccesfully measured in nth% of samples.
n_samples <- ncol(AMD_gRSet.funnorm)
bad_probes_count <- rowSums(bad_probes)
#Check thresholds to determine number of probes to remove:
#Probes that are unsuccessfully measured in 1% of samples.
sum(bad_probes_count > 0.010*n_samples) #18814 probes.## [1] 18814
#Probes that are unsuccessfully measured in 2.5% of samples.
sum(bad_probes_count > 0.025*n_samples) #5050 probes.## [1] 5050
#Probes that are unsuccesfully measured in 5% of samples.
sum(bad_probes_count > 0.050*n_samples) #3318 probes.## [1] 3318
#Remove probes that are unsuccessfully measured in 5% of samples.
remove_probes <- data.frame(probe_ID = rownames(detp_minfi)) %>% mutate(bad_probes_count = bad_probes_count) %>% filter(bad_probes_count > 0.05*n_samples)
AMD_gRSet.funnorm.filt <- AMD_gRSet.funnorm[!(rownames(AMD_gRSet.funnorm) %in% remove_probes$probe_ID),]
dim(AMD_gRSet.funnorm.filt) #425456 probes remaining after detection p-value and beadcount probe filtering. ## [1] 425456 44
#Load the probe filtered beta matrix.
betas.funnorm <- getBeta(AMD_gRSet.funnorm.filt)
meta <- pData(AMD_gRSet.funnorm.filt)
PCA_full <- princomp(betas.funnorm) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))
#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.
colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"
meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Sample_Group <- as.factor(meta$Sample_Group)
meta$Age <- as.numeric(meta$Age)
colnames(meta) #Obtain the column numbers to include.## [1] "Sample_Name" "Sample_Group" "Sex" "Age"
## [5] "Tissue" "Sentrix_Position" "Sentrix_ID" "Basename"
## [9] "filenames" "xMed" "yMed" "predictedSex"
meta_categorical <- data.frame(meta[, c(2,3,6,7)]) #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[,4]) #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.## [1] "Sample_Group" "Sex" "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease state", "Sex", "Row", "Chip")
colnames(meta_continuous)## [1] "meta...4."
colnames(meta_continuous) <- c("Age")
Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)save(AMD_gRSet.funnorm.filt, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.funnorm.filt.RData")library(EpiDISH)
#Load combat corrected data.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.funnorm.filt.RData")
AMD_pData <- as.data.frame(pData(AMD_gRSet.funnorm.filt))
betas.funnorm.filt <- getBeta(AMD_gRSet.funnorm.filt)
#Sanity checks.
sum(is.na(betas.funnorm.filt)) #0 NA.## [1] 0
sum(is.infinite(betas.funnorm.filt)) #0 inf. values.## [1] 0
#Load EpiDISH reference.
load("~/KoborLab/kobor_space/shared_coding_resource/EpiDish_Refs/centEpiFibIC.m.rda")
load("~/KoborLab/kobor_space/shared_coding_resource/EpiDish_Refs/centBloodSub.m.rda")
#Check to see if all the probes in the EpiDISH reference are in the combat corrected dataset.
all(rownames(centBloodSub.m) %in% rownames(betas.funnorm.filt)) #FALSE.## [1] FALSE
dim(centBloodSub.m[which(!rownames(centBloodSub.m) %in% rownames(betas.funnorm.filt)),]) #10 probes in the reference not in the combat corrected dataset.## [1] 10 7
all(rownames(centEpiFibIC.m) %in% rownames(betas.funnorm.filt)) #FALSE.## [1] FALSE
dim(centEpiFibIC.m[which(!rownames(centEpiFibIC.m) %in% rownames(betas.funnorm.filt)),]) #27 probes in the reference not in the combat corrected dataset.## [1] 27 3
AMD_Epi_counts <- as.data.frame(hepidish(betas.funnorm.filt, ref1.m = centEpiFibIC.m, ref2.m = centBloodSub.m, method = "RPC", h.CT.idx = 3))
#Plot EpiDISH cell type prediction.
AMD_Epi.melt <- gather(AMD_Epi_counts)
colnames(AMD_Epi.melt)[1] <- "Cell_type"
colnames(AMD_Epi.melt)[2] <- "Estimated_proportions"
ggplot(AMD_Epi.melt, aes(factor(Cell_type), Estimated_proportions)) +
geom_boxplot(aes(fill = Cell_type), outlier.shape = NA) +
geom_jitter(aes(fill = Cell_type), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.7, jitter.width = 0.1)) +
labs(x = "Cell Type", y = "Estimated Proportions",
caption = "Estimated cell type proportion using EpiDISH") +
scale_fill_manual(values = c("#7F44AB", "#56B4E9", "#E69F00", "#CC79A7", "#0072B2", "#009E73", "#D55E00", "#994C00", "#FFCCCC")) +
theme_classic() + theme(plot.caption = element_text(hjust = 0.5, size = 12), legend.position = "none")meta <- as.data.frame(AMD_pData)
#Check sample order to add cell type prediction to metadata.
identical(rownames(meta), rownames(AMD_Epi_counts)) #TRUE.## [1] TRUE
#Add cell type predictions to metadata.
meta <- cbind(meta, AMD_Epi_counts)
M_values.funnorm.filt <- beta2m(betas.funnorm.filt)
PCA_full <- princomp(M_values.funnorm.filt) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))
#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.
colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"
meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Sample_Group <- as.factor(meta$Sample_Group)
meta$Age <- as.numeric(meta$Age)
meta$Epi <- as.numeric(meta$Epi)
meta$Fib <- as.numeric(meta$Fib)
meta$B <- as.numeric(meta$B)
meta$NK <- as.numeric(meta$NK)
meta$CD4T <- as.numeric(meta$CD4T)
meta$CD8T <- as.numeric(meta$CD8T)
meta$Mono <- as.numeric(meta$Mono)
meta$Neutro <- as.numeric(meta$Neutro)
meta$Eosino <- as.numeric(meta$Eosino)
colnames(meta) #Obtain the column numbers to include.## [1] "Sample_Name" "Sample_Group" "Sex" "Age"
## [5] "Tissue" "Sentrix_Position" "Sentrix_ID" "Basename"
## [9] "filenames" "xMed" "yMed" "predictedSex"
## [13] "Epi" "Fib" "B" "NK"
## [17] "CD4T" "CD8T" "Mono" "Neutro"
## [21] "Eosino"
meta_categorical <- data.frame(meta[, c(2,3,6,7)]) #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[, c(4,13,14,15,16,17,19,20,21)]) #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.## [1] "Sample_Group" "Sex" "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease state", "Sex", "Row", "Chip")
colnames(meta_continuous)## [1] "Age" "Epi" "Fib" "B" "NK" "CD4T" "Mono" "Neutro"
## [9] "Eosino"
colnames(meta_continuous) <- c("Age", "Epithelial", "Fibroblast", "B Cell", "NK Cell", "CD4T", "Monocyte", "Neutrophil", "Eosinophil")
Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)save(AMD_Epi_counts, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/EpiDISH_celltype_prediction.RData")Surrogate variable analysis (SVA) (Jeffrey T Leek and John D Storey 2007, Jeffrey T Leek and John D Storey (2008)) is a useful tool to identified surrogate variables for unwanted variation while protecting for a phenotype of interest. In our experience, running SVA after normalizing the 450K data with preprocessFunnorm or preprocessQuantile increases the statistical power of the downstream analysis. For instance, to run SVA on the M-values, protecting for case-control status, the following code can be used to estimate the surrogate variables (this can take a few hours to run).
Input: Instead of using ComBat-corrected data to determine SVs, use probe-filtered data instead because Disease State may be confounded with Row, so using ComBat will wipe out the Disease State effects.
NOTE: 1 significant SV.
#Load objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.funnorm.filt.RData")
AMD_pData <- as.data.frame(pData(AMD_gRSet.funnorm.filt))
M_values.funnorm.filt <- getM(AMD_gRSet.funnorm.filt)
#SVA with Disease State as fixed variable.
mod <- model.matrix(~ as.numeric(Age) + as.factor(Sample_Group) + as.factor(Sex) + as.factor(Slide) + as.factor(Array), data = AMD_pData) #Full model = variable of interest + adjustment variables (co-variates).
mod0 <- model.matrix(~as.factor(Sample_Group) + as.factor(Sex) + as.factor(Slide) + as.factor(Array), data = AMD_pData) #Null model = adjustment variables only.
identical(rownames(mod), colnames(M_values.funnorm.filt)) #TRUE.## [1] TRUE
identical(rownames(mod0), colnames(M_values.funnorm.filt)) #TRUE.## [1] TRUE
#Check how many SVs.
n.sv <- num.sv(M_values.funnorm.filt, mod, method = "leek")
sva.results.all <- sva(M_values.funnorm.filt, mod, mod0, n.sv) ## Number of significant surrogate variables is: 1
## Iteration (out of 5 ):1 2 3 4 5
#sva outputs four components: sv, pprob.gam, pprob.b, n.sv.
#sv = a matrix whose columns correspond to the estimated surrogate variables.
#pprob.gam = posterior probability that each CpG is associated with one or more latent variables.
#pprob.b = posterior probability that each CpG is associated with the variables of interest.
#n.sv = the number of surrogate variables estimated by the sva. NOTE: 1 significant SV.
#Load objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.funnorm.filt.RData")
AMD_pData <- as.data.frame(pData(AMD_gRSet.funnorm.filt))
M_values.funnorm.filt <- getM(AMD_gRSet.funnorm.filt)
#SVA with Disease State as fixed variable.
mod <- model.matrix(~ as.numeric(Age) + as.factor(Sex) + as.factor(Slide) + as.factor(Array), data = AMD_pData) #Full model = variable of interest + adjustment variables (co-variates).
mod0 <- model.matrix(~ as.factor(Sex) + as.factor(Slide) + as.factor(Array), data = AMD_pData) #Null model = adjustment variables only.
identical(rownames(mod), colnames(M_values.funnorm.filt)) #TRUE.## [1] TRUE
identical(rownames(mod0), colnames(M_values.funnorm.filt)) #TRUE.## [1] TRUE
#Check how many SVs.
n.sv <- num.sv(M_values.funnorm.filt, mod, method = "leek")
sva.results.con <- sva(M_values.funnorm.filt, mod, mod0, n.sv) ## Number of significant surrogate variables is: 1
## Iteration (out of 5 ):1 2 3 4 5
#sva outputs four components: sv, pprob.gam, pprob.b, n.sv.
#sv = a matrix whose columns correspond to the estimated surrogate variables.
#pprob.gam = posterior probability that each CpG is associated with one or more latent variables.
#pprob.b = posterior probability that each CpG is associated with the variables of interest.
#n.sv = the number of surrogate variables estimated by the sva. NOTE: 1 significant SV.
#Load objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.funnorm.filt.RData")
AMD_pData <- as.data.frame(pData(AMD_gRSet.funnorm.filt))
M_values.funnorm.filt <- getM(AMD_gRSet.funnorm.filt)
#SVA with Disease State as fixed variable.
mod <- model.matrix(~ as.numeric(Age) + as.factor(Slide) + as.factor(Array), data = AMD_pData) #Full model = variable of interest + adjustment variables (co-variates).
mod0 <- model.matrix(~ as.factor(Slide) + as.factor(Array), data = AMD_pData) #Null model = adjustment variables only.
identical(rownames(mod), colnames(M_values.funnorm.filt)) #TRUE.## [1] TRUE
identical(rownames(mod0), colnames(M_values.funnorm.filt)) #TRUE.## [1] TRUE
#Check how many SVs.
n.sv <- num.sv(M_values.funnorm.filt, mod, method = "leek")
sva.results.batch <- sva(M_values.funnorm.filt, mod, mod0, n.sv) ## Number of significant surrogate variables is: 1
## Iteration (out of 5 ):1 2 3 4 5
#sva outputs four components: sv, pprob.gam, pprob.b, n.sv.
#sv = a matrix whose columns correspond to the estimated surrogate variables.
#pprob.gam = posterior probability that each CpG is associated with one or more latent variables.
#pprob.b = posterior probability that each CpG is associated with the variables of interest.
#n.sv = the number of surrogate variables estimated by the sva. NOTE: 1 significant SV.
#Load objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_gRSet.funnorm.filt.RData")
AMD_pData <- as.data.frame(pData(AMD_gRSet.funnorm.filt))
M_values.funnorm.filt <- getM(AMD_gRSet.funnorm.filt)
#SVA with Disease State as fixed variable.
mod <- model.matrix(~ as.numeric(Age), data = AMD_pData) #Full model = variable of interest + adjustment variables (co-variates).
mod0 <- model.matrix(~ 1, data = AMD_pData) #Null model = adjustment variables only.
identical(rownames(mod), colnames(M_values.funnorm.filt)) #TRUE.## [1] TRUE
identical(rownames(mod0), colnames(M_values.funnorm.filt)) #TRUE.## [1] TRUE
#Check how many SVs.
n.sv <- num.sv(M_values.funnorm.filt, mod, method = "leek")
sva.results.age <- sva(M_values.funnorm.filt, mod, mod0, n.sv) ## Number of significant surrogate variables is: 1
## Iteration (out of 5 ):1 2 3 4 5
#sva outputs four components: sv, pprob.gam, pprob.b, n.sv.
#sv = a matrix whose columns correspond to the estimated surrogate variables.
#pprob.gam = posterior probability that each CpG is associated with one or more latent variables.
#pprob.b = posterior probability that each CpG is associated with the variables of interest.
#n.sv = the number of surrogate variables estimated by the sva. SVs <- cbind(sva.results.all$sv, sva.results.con$sv)
SVs <- cbind(SVs, sva.results.batch$sv)
SVs <- cbind(SVs, sva.results.age$sv)
SVs <- as.data.frame(SVs)
colnames(SVs) <- c("SV_All", "SV_Confounders", "SV_Batch", "SV_Only_Age")#Check for correlation between the SVs.
ggplot(SVs, aes(SV_All, SV_Batch)) + geom_point() + theme_classic()ggplot(SVs, aes(SV_All, SV_Confounders)) + geom_point() + theme_classic()ggplot(SVs, aes(SV_Batch, SV_Confounders)) + geom_point() + theme_classic()ggplot(SVs, aes(SV_Only_Age, SV_All)) + geom_point() + theme_classic()ggplot(SVs, aes(SV_Only_Age, SV_Batch)) + geom_point() + theme_classic()ggplot(SVs, aes(SV_Only_Age, SV_Confounders)) + geom_point() + theme_classic()load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/EpiDISH_celltype_prediction.RData")
#Add SV and cell type prediction to AMD_pData.
identical(rownames(AMD_pData), rownames(AMD_Epi_counts))## [1] TRUE
AMD_pData <- cbind(AMD_pData, SVs$SV_All)
AMD_pData <- cbind(AMD_pData, AMD_Epi_counts)
colnames(AMD_pData)[13] <- "SV"Only SV PCA.
meta <- as.data.frame(AMD_pData)
PCA_full <- princomp(M_values.funnorm.filt) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))
#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.
colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"
meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Sample_Group <- as.factor(meta$Sample_Group)
meta$Age <- as.numeric(meta$Age)
meta$SV <- as.numeric(meta$SV)
colnames(meta) #Obtain the column numbers to include.## [1] "Sample_Name" "Sample_Group" "Sex" "Age"
## [5] "Tissue" "Sentrix_Position" "Sentrix_ID" "Basename"
## [9] "filenames" "xMed" "yMed" "predictedSex"
## [13] "SV" "Epi" "Fib" "B"
## [17] "NK" "CD4T" "CD8T" "Mono"
## [21] "Neutro" "Eosino"
meta_categorical <- data.frame(meta[, c(2,3,6,7)]) #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[, c(4,13)]) #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.## [1] "Sample_Group" "Sex" "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease State", "Sex", "Row", "Chip")
colnames(meta_continuous)## [1] "Age" "SV"
colnames(meta_continuous) <- c("Age", "SV")
Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)SV + Cell Type PCA.
meta <- as.data.frame(AMD_pData)
PCA_full <- princomp(M_values.funnorm.filt) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))
#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.
colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"
meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Sample_Group <- as.factor(meta$Sample_Group)
meta$Age <- as.numeric(meta$Age)
meta$SV <- as.numeric(meta$SV)
meta$Epi <- as.numeric(meta$Epi)
meta$Fib <- as.numeric(meta$Fib)
meta$B <- as.numeric(meta$B)
meta$NK <- as.numeric(meta$NK)
meta$CD4T <- as.numeric(meta$CD4T)
meta$CD8T <- as.numeric(meta$CD8T)
meta$Mono <- as.numeric(meta$Mono)
meta$Neutro <- as.numeric(meta$Neutro)
meta$Eosino <- as.numeric(meta$Eosino)
colnames(meta) #Obtain the column numbers to include.## [1] "Sample_Name" "Sample_Group" "Sex" "Age"
## [5] "Tissue" "Sentrix_Position" "Sentrix_ID" "Basename"
## [9] "filenames" "xMed" "yMed" "predictedSex"
## [13] "SV" "Epi" "Fib" "B"
## [17] "NK" "CD4T" "CD8T" "Mono"
## [21] "Neutro" "Eosino"
meta_categorical <- data.frame(meta[, c(2,3,6,7)]) #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[, c(4,13:18,20:22)]) #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.## [1] "Sample_Group" "Sex" "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease State", "Sex", "Row", "Chip")
colnames(meta_continuous)## [1] "Age" "SV" "Epi" "Fib" "B" "NK" "CD4T" "Mono"
## [9] "Neutro" "Eosino"
colnames(meta_continuous) <- c("Age", "SV", "Epithelial", "Fibroblast", "B Cell", "NK Cell", "CD4T", "Monocyte", "Neutrophil", "Eosinophil")
Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)#Clean up column names of AMD_pData.
colnames(AMD_pData)[2] <- "Disease_State"
colnames(AMD_pData)[6:7] <- c("Row", "Chip")
colnames(AMD_pData)[14:22] <- c("Epithelial", "Fibroblast", "B_Cell", "NK_Cell", "CD4T", "CD8T", "Monocyte", "Neutrophil", "Eosinophil")
save(AMD_pData, M_values.funnorm.filt, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")Look for correlation between variables in the metadata. There might be a possibility that Age and Sex are confounded with Batch.
#Load objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")
#Subset the biological and technical variables to check correlation.
colnames(AMD_pData) #Check columns to subset.## [1] "Sample_Name" "Disease_State" "Sex" "Age"
## [5] "Tissue" "Row" "Chip" "Basename"
## [9] "filenames" "xMed" "yMed" "predictedSex"
## [13] "SV" "Epithelial" "Fibroblast" "B_Cell"
## [17] "NK_Cell" "CD4T" "CD8T" "Monocyte"
## [21] "Neutrophil" "Eosinophil"
meta.cor <- AMD_pData[, c(2:4, 6:7, 13:18, 20:22)] #I didn't include tissue because they are all the same tissue.
#Turn categorical variables into numerical variables.
meta.cor$Disease_State <- as.numeric(unlist(as.factor(meta.cor$Disease_State)))
meta.cor$Sex <- as.numeric((unlist(as.factor(meta.cor$Sex))))
meta.cor$Age <- as.numeric(meta.cor$Age)
meta.cor$Row <- as.numeric(unlist(as.factor(meta.cor$Row)))
meta.cor$Chip <- as.numeric(unlist(as.factor(meta.cor$Chip)))
str(meta.cor)## 'data.frame': 44 obs. of 14 variables:
## $ Disease_State: num 2 2 2 1 1 1 2 1 1 1 ...
## $ Sex : num 2 2 2 2 2 1 2 2 1 2 ...
## $ Age : num 61 74 70 76 79 89 66 70 83 76 ...
## $ Row : num 3 10 12 1 3 5 7 9 11 2 ...
## $ Chip : num 2 2 2 4 4 4 4 4 4 4 ...
## $ SV : num -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
## $ Epithelial : num 0.387 0.436 0.367 0.413 0.387 ...
## $ Fibroblast : num 0.326 0.316 0.298 0.24 0.312 ...
## $ B_Cell : num 0.0502 0.0454 0.0582 0.0528 0.0482 ...
## $ NK_Cell : num 0.065 0.0601 0.0816 0.0986 0.0743 ...
## $ CD4T : num 0.0478 0.0467 0.0545 0.0658 0.0544 ...
## $ Monocyte : num 0.0685 0.05 0.081 0.0666 0.0651 ...
## $ Neutrophil : num 0 0 0 0 0 ...
## $ Eosinophil : num 0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Metadata correlation.
colnames(meta.cor) #Choose variables to check.## [1] "Disease_State" "Sex" "Age" "Row"
## [5] "Chip" "SV" "Epithelial" "Fibroblast"
## [9] "B_Cell" "NK_Cell" "CD4T" "Monocyte"
## [13] "Neutrophil" "Eosinophil"
meta.SV.celltype <- meta.cor[, c(6:14)]
grey <- colorRampPalette(brewer.pal(n = 9, "BrBG"))
heatmap.2(cor(meta.SV.celltype, use = "pairwise.complete.obs", method = "spearman"), cexCol = 1.5, cexRow = 1.5, col = grey, dendrogram = "both", scale = "none", margins = c(8,8), trace = "none", keysize = 1.3)#Metadata correlation.
colnames(meta.cor) #Choose variables to check.## [1] "Disease_State" "Sex" "Age" "Row"
## [5] "Chip" "SV" "Epithelial" "Fibroblast"
## [9] "B_Cell" "NK_Cell" "CD4T" "Monocyte"
## [13] "Neutrophil" "Eosinophil"
meta.all <- meta.cor[, c(1:7)]
grey <- colorRampPalette(brewer.pal(n = 9, "BrBG"))
heatmap.2(cor(meta.all, use = "pairwise.complete.obs", method = "spearman"), cexCol = 1.5, cexRow = 1.5, col = grey, dendrogram = "both", scale = "none", margins = c(10,10), trace = "none", keysize = 1.3)meta <- as.data.frame(AMD_pData)
PCA_full <- princomp(M_values.funnorm.filt) #You can't have NAs in your dataframe - make sure to either remove probes with a lot of NAs and/or have imputed values. Can run na.omit but depending on the stage of pre-processing, could result in a lot of probes to be removed and a very inaccurate PCA.
Loadings <- as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance <- vars/sum(vars)
adjust <- 1-Importance[1]
pca_adjusted <- Importance[2:length(Importance)]/adjust
pca_df <- data.frame(adjusted_variance = pca_adjusted, PC = seq(1:length(pca_adjusted)))
#Restructure meta so that variables are in the appropriate format - categorical variables (sex is commonly labelled as 0 and 1) are factors and not numeric for example.
colnames(meta)[6] <- "Sentrix_Position"
colnames(meta)[7] <- "Sentrix_ID"
meta$Sentrix_ID <- as.factor(meta$Sentrix_ID)
meta$Sentrix_Position <- as.factor(meta$Sentrix_Position)
meta$Sex <- as.factor(meta$Sex)
meta$Disease_State <- as.factor(meta$Disease_State)
meta$Age <- as.numeric(meta$Age)
meta$SV <- as.numeric(meta$SV)
meta$Epithelial <- as.numeric(meta$Epithelial)
colnames(meta) #Obtain the column numbers to include.## [1] "Sample_Name" "Disease_State" "Sex" "Age"
## [5] "Tissue" "Sentrix_Position" "Sentrix_ID" "Basename"
## [9] "filenames" "xMed" "yMed" "predictedSex"
## [13] "SV" "Epithelial" "Fibroblast" "B_Cell"
## [17] "NK_Cell" "CD4T" "CD8T" "Monocyte"
## [21] "Neutrophil" "Eosinophil"
meta_categorical <- data.frame(meta[, c(2,3,6,7)]) #Input column numbers in meta that contain categorical variables.
meta_continuous <- data.frame(meta[, c(4,13,14)]) #Input column numbers in meta that contain continuous variables.
colnames(meta_categorical) #Write the line below to ensure you're changing the names of your variables in the right order.## [1] "Disease_State" "Sex" "Sentrix_Position" "Sentrix_ID"
colnames(meta_categorical) <- c("Disease State", "Sex", "Row", "Chip")
colnames(meta_continuous)## [1] "Age" "SV" "Epithelial"
Order <- c(seq(1:sum(ncol(meta_categorical), ncol(meta_continuous))))
Num <- 16 #This number will depend on your dataset - if you only have 6 samples, you should probably only show 5 PCs or less.#Run PCA.
source("~/KoborLab/kobor_space/kendrix/R_Functions/heat_scree_plot.R", local = knitr::knit_global())
heat_scree_plot(Loadings, Importance, Num, Order)#Age is confounded by chip.
ggplot(AMD_pData, aes(Chip, Age)) +
geom_boxplot() +
geom_jitter(aes(fill = Chip), position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Chip") +
scale_x_discrete(breaks = c("200723300084","200723300089","200723300090","200770460039"), labels = c("Chip084","Chip089","Chip090","Chip039")) +
theme_classic2() + theme(legend.position = "none")The chunks below are done in an attempt to replicate Porter et al.’s hits in their publication to check the integrity of the dataset.
#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")
dim(AMD_pData) #44 samples.## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.## [1] 425456 44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)
str(AMD_pData)## 'data.frame': 44 obs. of 22 variables:
## $ Sample_Name : chr "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
## $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
## $ Age : num 61 74 70 76 79 89 66 70 83 76 ...
## $ Tissue : chr "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
## $ Row : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
## $ Chip : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
## $ Basename : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ filenames : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ xMed : num 11.4 11.7 11.9 11.7 11.6 ...
## $ yMed : num 11.7 12 12.1 11.9 11.9 ...
## $ predictedSex : chr "M" "M" "M" "M" ...
## $ SV : num -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
## $ Epithelial : num 0.387 0.436 0.367 0.413 0.387 ...
## $ Fibroblast : num 0.326 0.316 0.298 0.24 0.312 ...
## $ B_Cell : num 0.0502 0.0454 0.0582 0.0528 0.0482 ...
## $ NK_Cell : num 0.065 0.0601 0.0816 0.0986 0.0743 ...
## $ CD4T : num 0.0478 0.0467 0.0545 0.0658 0.0544 ...
## $ CD8T : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Monocyte : num 0.0685 0.05 0.081 0.0666 0.0651 ...
## $ Neutrophil : num 0 0 0 0 0 ...
## $ Eosinophil : num 0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. ## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values.
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. ## [1] TRUE
library(pbapply) #Progress bar for apply functions.
#EWAS on Disease_State - All samples.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Porter_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
meta <- AMD_pData
meta$Mval <- M_values.funnorm.filt[CpG,]
mod_Porter <- lm(Mval ~ Disease_State + Sex + Disease_State*Sex + Chip, data = meta)
coef(summary(mod_Porter))[2,4]}) #Returns nominal p-value for Disease State for model at each CpG.
head(Porter_LM_pval)
save(Porter_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Porter_LM_pval.RData")load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Porter_LM_pval.RData")
#Inspect p-value distribution for model.
pvalue_dist_Porter <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Porter_LM_pval)
ggplot(pvalue_dist_Porter, aes(Nominal_P)) +
geom_histogram(fill = "grey90", color = "black") +
theme_classic() + xlab("Nominal P Value") +
ylim(0, 20000) +
xlim(min(Porter_LM_pval), max(Porter_LM_pval))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#Not right-skewed. Distribution is a little even.
#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Porter_LM_pval, method = "fdr", n = length(Porter_LM_pval))
#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #0 at 0.05.## [1] 0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #0 at 0.1.## [1] 0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #1 at 0.2.## [1] 1 44
#Looking at top hits by nominal P:
pvalue_dist_Porter <- pvalue_dist_Porter[order(pvalue_dist_Porter$Nominal_P),]
head(pvalue_dist_Porter)## CpG Nominal_P
## 171012 cg07685601 3.761885e-07
## 152996 cg08744475 9.429167e-07
## 383678 cg19561908 2.657040e-06
## 386931 cg21116314 2.946301e-06
## 325595 cg16540262 3.344313e-06
## 232459 cg06113708 9.410994e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
dim(hits_CpGs <- pvalue_dist_Porter[which(pvalue_dist_Porter$Nominal_P < 1e-6),]) #2 hits.## [1] 2 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name## [1] MAD1L1;MAD1L1;MAD1L1
## 66070 Levels: A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
# Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
# mainvar should be a vector of continuous variable
# all covars should also be vectors
# df = dataframe or matrix of beta values
# output is a vector of delta beta values
sd=sd(mainvar)
qt <-
range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
dB <- vector(mode = "numeric", length = nrow(df))
names(dB) <- rownames(df)
for (i in 1:nrow(df)) {
beta <- df[i, ]
if (is.null(covar1)) {
mod <- lm(beta ~ mainvar)
} else if (is.null(covar2)) {
mod <- lm(beta ~ mainvar + covar1)
} else if (is.null(covar3)) {
mod <- lm(beta ~ mainvar + covar1 + covar2)
} else if (is.null(covar4)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
} else if (is.null(covar5)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
} else {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
}
slope <- mod$coefficients[2]
dB[i] <- as.numeric(slope*range)
}
dB
}
betas.funnorm.filt <- m2beta(M_values.funnorm.filt)
delta_beta_Porter_fixed <- deltabeta(as.matrix(betas.funnorm.filt), as.numeric(as.factor(AMD_pData$Disease_State)),
covar1 = as.numeric(as.factor(AMD_pData$Sex)),
covar2 = as.numeric(as.factor(AMD_pData$Disease_State))*as.numeric(as.factor(AMD_pData$Sex)),
covar3 = as.numeric(as.factor(AMD_pData$Chip)))
length(delta_beta_Porter_fixed)
summary(delta_beta_Porter_fixed)
save(delta_beta_Porter_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Porter_fixed.RData")##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Porter_fixed.RData")
#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")
#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Porter_Table <- data.frame(rownames(M_values.funnorm.filt), Porter_LM_pval, Multi_test_corr_relaxed, delta_beta_Porter_fixed)
colnames(Porter_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Porter_Table)), as.character(Porter_Table$CpG)) #TRUE.## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Porter_Table[order(Porter_Table$Nominal_P),],10)## CpG Nominal_P FDR Delta_Beta
## cg07685601 cg07685601 3.761885e-07 0.1600517 -0.09740259
## cg08744475 cg08744475 9.429167e-07 0.2005848 -0.11530537
## cg19561908 cg19561908 2.657040e-06 0.2845716 -0.13768018
## cg21116314 cg21116314 2.946301e-06 0.2845716 -0.32142870
## cg16540262 cg16540262 3.344313e-06 0.2845716 -0.11951332
## cg06113708 cg06113708 9.410994e-06 0.6673273 -0.10939259
## cg20668447 cg20668447 1.244784e-05 0.7565727 -0.02217991
## cg24033103 cg24033103 1.466881e-05 0.7801166 -0.14165294
## cg02283787 cg02283787 2.191394e-05 0.9055315 -0.10045724
## cg09562539 cg09562539 2.708660e-05 0.9055315 0.04114078
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Porter_Table$Nominal_P, Porter_Table$Delta_Beta, 0.05, 1e-6, "DNAm changes", 0.3) #at 5e-6: 7 Hypermethylated, 0 Hypomethylated## [1] "Hypermethylated: 0"
## [1] "Hypomethylated: 2"
## Warning: Removed 130 rows containing missing values (geom_point).
#What are these hits?
#First make an annotated table - load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
Porter_Table.annotated = merge(Porter_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Porter_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Porter_Table.annotated <- Porter_Table.annotated[order(Porter_Table.annotated$Nominal_P),]
#Grabbing the Volcano hits:
LM_Porter_Hits <- Porter_Table.annotated[which(abs(Porter_Table.annotated$Delta_Beta)>0.05 & Porter_Table.annotated$Nominal_P<1e-6),]
#Let's order by Nominal_P:
LM_Porter_Hits = LM_Porter_Hits[order(LM_Porter_Hits$Nominal_P),]
rownames(LM_Porter_Hits) = c()
str(LM_Porter_Hits)## 'data.frame': 2 obs. of 8 variables:
## $ CpG : Factor w/ 425456 levels "cg00000029","cg00000108",..: 131396 148247
## $ Nominal_P : num 3.76e-07 9.43e-07
## $ FDR : num 0.16 0.201
## $ Delta_Beta : num -0.0974 -0.1153
## $ Chromosome : Factor w/ 25 levels "","1","10","11",..: 21 20
## $ Coordinate : Factor w/ 3 levels "","F","R": 3 2
## $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 33670 1
## $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 4267 1
head(LM_Porter_Hits)## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 1 cg07685601 3.761885e-07 0.1600517 -0.09740259 7 R
## 2 cg08744475 9.429167e-07 0.2005848 -0.11530537 6 F
## UCSC_RefGene_Name UCSC_RefGene_Group
## 1 MAD1L1;MAD1L1;MAD1L1 Body;Body;Body
## 2
No covariates.
#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")
dim(AMD_pData) #44 samples.## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.## [1] 425456 44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)
str(AMD_pData)## 'data.frame': 44 obs. of 22 variables:
## $ Sample_Name : chr "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
## $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
## $ Age : num 61 74 70 76 79 89 66 70 83 76 ...
## $ Tissue : chr "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
## $ Row : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
## $ Chip : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
## $ Basename : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ filenames : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ xMed : num 11.4 11.7 11.9 11.7 11.6 ...
## $ yMed : num 11.7 12 12.1 11.9 11.9 ...
## $ predictedSex : chr "M" "M" "M" "M" ...
## $ SV : num -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
## $ Epithelial : num 0.387 0.436 0.367 0.413 0.387 ...
## $ Fibroblast : num 0.326 0.316 0.298 0.24 0.312 ...
## $ B_Cell : num 0.0502 0.0454 0.0582 0.0528 0.0482 ...
## $ NK_Cell : num 0.065 0.0601 0.0816 0.0986 0.0743 ...
## $ CD4T : num 0.0478 0.0467 0.0545 0.0658 0.0544 ...
## $ CD8T : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Monocyte : num 0.0685 0.05 0.081 0.0666 0.0651 ...
## $ Neutrophil : num 0 0 0 0 0 ...
## $ Eosinophil : num 0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. ## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values.
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. ## [1] TRUE
library(pbapply) #Progress bar for apply functions.
#EWAS on Age - All samples.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
meta <- AMD_pData
meta$Mval <- M_values.funnorm.filt[CpG,]
mod_Age <- lm(Mval ~ Age, data = meta)
coef(summary(mod_Age))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age_LM_pval)
save(Age_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age_LM_pval.RData")load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age_LM_pval.RData")
#Inspect p-value distribution for model.
pvalue_dist_Age <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age_LM_pval)
ggplot(pvalue_dist_Age, aes(Nominal_P)) +
geom_histogram(fill = "grey90", color = "black") +
theme_classic() + xlab("Nominal P Value") +
ylim(0, 20000) +
xlim(min(Age_LM_pval), max(Age_LM_pval))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#Not right-skewed. Distribution is a little even.
#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Age_LM_pval, method = "fdr", n = length(Age_LM_pval))
#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #1 at 0.05.## [1] 1 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #1 at 0.1.## [1] 1 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #8 at 0.2.## [1] 8 44
#Looking at top hits by nominal P:
pvalue_dist_Age <- pvalue_dist_Age[order(pvalue_dist_Age$Nominal_P),]
head(pvalue_dist_Age)## CpG Nominal_P
## 51300 cg22197050 9.782282e-08
## 347431 cg06596654 9.002000e-07
## 295662 cg02727104 1.154277e-06
## 50945 cg22166290 1.224729e-06
## 120455 cg09011833 1.648428e-06
## 112991 cg01594233 1.928795e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
dim(hits_CpGs <- pvalue_dist_Age[which(pvalue_dist_Age$Nominal_P < 1e-6),]) #2 hits.## [1] 2 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name## [1] LOC100132215
## 66070 Levels: A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
# Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
# mainvar should be a vector of continuous variable
# all covars should also be vectors
# df = dataframe or matrix of beta values
# output is a vector of delta beta values
sd=sd(mainvar)
qt <-
range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
dB <- vector(mode = "numeric", length = nrow(df))
names(dB) <- rownames(df)
for (i in 1:nrow(df)) {
beta <- df[i, ]
if (is.null(covar1)) {
mod <- lm(beta ~ mainvar)
} else if (is.null(covar2)) {
mod <- lm(beta ~ mainvar + covar1)
} else if (is.null(covar3)) {
mod <- lm(beta ~ mainvar + covar1 + covar2)
} else if (is.null(covar4)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
} else if (is.null(covar5)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
} else {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
}
slope <- mod$coefficients[2]
dB[i] <- as.numeric(slope*range)
}
dB
}
betas.funnorm.filt <- m2beta(M_values.funnorm.filt)
delta_beta_Age_fixed <- deltabeta(as.matrix(betas.funnorm.filt), AMD_pData$Age)
length(delta_beta_Age_fixed)
summary(delta_beta_Age_fixed)
save(delta_beta_Age_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age_fixed.RData")##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age_fixed.RData")
#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")
#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Age_Table <- data.frame(rownames(M_values.funnorm.filt), Age_LM_pval, Multi_test_corr_relaxed, delta_beta_Age_fixed)
colnames(Age_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Age_Table)), as.character(Age_Table$CpG)) #TRUE.## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Age_Table[order(Age_Table$Nominal_P),],10)## CpG Nominal_P FDR Delta_Beta
## cg22197050 cg22197050 9.782282e-08 0.04161931 0.07334226
## cg06596654 cg06596654 9.002000e-07 0.12583304 0.19294123
## cg02727104 cg02727104 1.154277e-06 0.12583304 0.12586465
## cg22166290 cg22166290 1.224729e-06 0.12583304 0.06480999
## cg09011833 cg09011833 1.648428e-06 0.12583304 0.04786280
## cg01594233 cg01594233 1.928795e-06 0.12583304 0.11625353
## cg23813012 cg23813012 2.248637e-06 0.12583304 0.09040002
## cg06173889 cg06173889 2.366083e-06 0.12583304 0.21664766
## cg17786642 cg17786642 5.481186e-06 0.25911150 0.13094473
## cg15444387 cg15444387 6.873199e-06 0.29242438 0.08297815
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Age_Table$Nominal_P, Age_Table$Delta_Beta, 0.05, 5e-6, "DNAm changes", 0.3) #at 5e-6: 7 Hypermethylated, 0 Hypomethylated## [1] "Hypermethylated: 7"
## [1] "Hypomethylated: 0"
## Warning: Removed 236 rows containing missing values (geom_point).
#What are these hits?
#First make an annotated table - load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
Age_Table.annotated = merge(Age_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Age_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Age_Table.annotated <- Age_Table.annotated[order(Age_Table.annotated$Nominal_P),]
#Grabbing the Volcano hits:
LM_Age_Hits <- Age_Table.annotated[which(abs(Age_Table.annotated$Delta_Beta)>0.05 & Age_Table.annotated$Nominal_P<5e-6),]
#Let's order by Nominal_P:
LM_Age_Hits = LM_Age_Hits[order(LM_Age_Hits$Nominal_P),]
rownames(LM_Age_Hits) = c()
str(LM_Age_Hits)## 'data.frame': 7 obs. of 8 variables:
## $ CpG : Factor w/ 425456 levels "cg00000029","cg00000108",..: 345056 112794 48814 344702 28944 367394 106386
## $ Nominal_P : num 9.78e-08 9.00e-07 1.15e-06 1.22e-06 1.93e-06 ...
## $ FDR : num 0.0416 0.1258 0.1258 0.1258 0.1258 ...
## $ Delta_Beta : num 0.0733 0.1929 0.1259 0.0648 0.1163 ...
## $ Chromosome : Factor w/ 25 levels "","1","10","11",..: 13 9 6 13 18 2 13
## $ Coordinate : Factor w/ 3 levels "","F","R": 3 3 2 2 3 3 3
## $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 30522 1 1 4760 1 45910 54913
## $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 5441 1 1 151 1 5441 1209
head(LM_Age_Hits)## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 1 cg22197050 9.782282e-08 0.04161931 0.07334226 2 R
## 2 cg06596654 9.002000e-07 0.12583304 0.19294123 16 R
## 3 cg02727104 1.154277e-06 0.12583304 0.12586465 13 F
## 4 cg22166290 1.224729e-06 0.12583304 0.06480999 2 F
## 5 cg01594233 1.928795e-06 0.12583304 0.11625353 4 R
## 6 cg23813012 2.248637e-06 0.12583304 0.09040002 1 R
## UCSC_RefGene_Name
## 1 LOC100132215
## 2
## 3
## 4 BCL11A;BCL11A;BCL11A;BCL11A;BCL11A;BCL11A
## 5
## 6 PRDM2
## UCSC_RefGene_Group
## 1 TSS1500
## 2
## 3
## 4 1stExon;1stExon;1stExon;5'UTR;5'UTR;5'UTR
## 5
## 6 TSS1500
Age-associated hits: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3482848/ Non-tissue specific hits: https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-018-0191-3
#Reorder row index.
rownames(Age_Table.annotated) <- NULL
#Look at hits based on candidate genes.
head(Age_Table.annotated[which(Age_Table.annotated$UCSC_RefGene_Name == "ELOVL2"),]) #First ELOVL2 hit == row 14999.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 5818 cg24724428 0.01306187 0.8679507 0.13816266 6 F
## 10597 cg16867657 0.02498331 0.9073335 0.12001180 6 F
## 22215 cg01799681 0.05528434 0.9629861 -0.06722849 6 F
## 40061 cg21572722 0.10147615 0.9880938 0.06130403 6 F
## 52950 cg13562911 0.13527099 0.9993493 0.01746916 6 R
## 128106 cg25151806 0.32871854 0.9999614 0.01135130 6 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 5818 ELOVL2 TSS1500
## 10597 ELOVL2 TSS1500
## 22215 ELOVL2 Body
## 40061 ELOVL2 TSS1500
## 52950 ELOVL2 Body
## 128106 ELOVL2 TSS1500
head(Age_Table.annotated[which(Age_Table.annotated$UCSC_RefGene_Name == "EDARADD"),]) #First EDARADD hit == row 198718.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 87169 cg18964582 0.223704 0.9999614 0.01332839 1 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 87169 EDARADD TSS1500
head(Age_Table.annotated[which(Age_Table.annotated$UCSC_RefGene_Name == "TOM1L1"),]) #First TOM1L1 hit == row 6147.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 389 cg25431220 0.000539516 0.5571367 0.02272728 17 F
## 4659 cg07081054 0.010129245 0.8435414 0.05077271 17 R
## 17992 cg03870845 0.044346041 0.9501010 0.01825487 17 F
## 61481 cg10237252 0.157455931 0.9999614 0.08567139 17 R
## 93820 cg05265484 0.240881548 0.9999614 0.06522953 17 F
## 97185 cg12240603 0.249676189 0.9999614 0.02272064 17 F
## UCSC_RefGene_Name UCSC_RefGene_Group
## 389 TOM1L1 TSS200
## 4659 TOM1L1 Body
## 17992 TOM1L1 TSS200
## 61481 TOM1L1 TSS1500
## 93820 TOM1L1 TSS1500
## 97185 TOM1L1 TSS1500
head(Age_Table.annotated[which(Age_Table.annotated$UCSC_RefGene_Name == "NPTX2"),]) #First NPTX2 hit == row 18625.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 8960 cg13878520 0.02083867 0.8954641 0.03776219 7 R
## 30407 cg08315202 0.07634766 0.9770872 0.04132384 7 R
## 31888 cg13314145 0.08037648 0.9807878 0.02325648 7 R
## 66889 cg13695954 0.17144570 0.9999614 0.01185541 7 F
## 71619 cg13585675 0.18350664 0.9999614 0.03665194 7 F
## 85391 cg17278447 0.21901598 0.9999614 0.04496371 7 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 8960 NPTX2 Body
## 30407 NPTX2 TSS1500
## 31888 NPTX2 TSS1500
## 66889 NPTX2 Body
## 71619 NPTX2 Body
## 85391 NPTX2 Body
Only SV (cell type) as covariate.
#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")
dim(AMD_pData) #44 samples.## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.## [1] 425456 44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)
str(AMD_pData)## 'data.frame': 44 obs. of 22 variables:
## $ Sample_Name : chr "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
## $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
## $ Age : num 61 74 70 76 79 89 66 70 83 76 ...
## $ Tissue : chr "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
## $ Row : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
## $ Chip : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
## $ Basename : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ filenames : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ xMed : num 11.4 11.7 11.9 11.7 11.6 ...
## $ yMed : num 11.7 12 12.1 11.9 11.9 ...
## $ predictedSex : chr "M" "M" "M" "M" ...
## $ SV : num -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
## $ Epithelial : num 0.387 0.436 0.367 0.413 0.387 ...
## $ Fibroblast : num 0.326 0.316 0.298 0.24 0.312 ...
## $ B_Cell : num 0.0502 0.0454 0.0582 0.0528 0.0482 ...
## $ NK_Cell : num 0.065 0.0601 0.0816 0.0986 0.0743 ...
## $ CD4T : num 0.0478 0.0467 0.0545 0.0658 0.0544 ...
## $ CD8T : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Monocyte : num 0.0685 0.05 0.081 0.0666 0.0651 ...
## $ Neutrophil : num 0 0 0 0 0 ...
## $ Eosinophil : num 0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. ## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values.
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. ## [1] TRUE
library(pbapply) #Progress bar for apply functions.
#EWAS on Age - All samples + SV.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age.SV_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
meta <- AMD_pData
meta$Mval <- M_values.funnorm.filt[CpG,]
mod_Age.SV <- lm(Mval ~ Age + SV, data = meta) #Only SV as covariate.
coef(summary(mod_Age.SV))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age.SV_LM_pval)
save(Age.SV_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.SV_LM_pval.RData")load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.SV_LM_pval.RData")
#Inspect p-value distribution for model.
pvalue_dist_Age.SV <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age.SV_LM_pval)
ggplot(pvalue_dist_Age.SV, aes(Nominal_P)) +
geom_histogram(fill = "grey90", color = "black") +
theme_classic() + xlab("Nominal P Value") +
ylim(0, 20000) +
xlim(min(Age.SV_LM_pval), max(Age.SV_LM_pval))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_bar).
#Not right-skewed. Distribution is a little even.
#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Age.SV_LM_pval, method = "fdr", n = length(Age.SV_LM_pval))
#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #5 at 0.05.## [1] 5 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #18 at 0.1.## [1] 18 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #133 at 0.2.## [1] 133 44
#Looking at top hits by nominal P:
pvalue_dist_Age.SV <- pvalue_dist_Age.SV[order(pvalue_dist_Age.SV$Nominal_P),]
head(pvalue_dist_Age.SV)## CpG Nominal_P
## 51300 cg22197050 7.223296e-08
## 100675 cg25359907 1.752215e-07
## 347431 cg06596654 2.082010e-07
## 49545 cg21875802 2.906234e-07
## 291484 cg05037876 3.405108e-07
## 295662 cg02727104 1.053368e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
dim(hits_CpGs <- pvalue_dist_Age.SV[which(pvalue_dist_Age.SV$Nominal_P < 1e-6),]) #5 hits.## [1] 5 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name## [1] LOC84740 LOC100132215 SFRS8
## 66070 Levels: A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
# Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
# mainvar should be a vector of continuous variable
# all covars should also be vectors
# df = dataframe or matrix of beta values
# output is a vector of delta beta values
sd=sd(mainvar)
qt <-
range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
dB <- vector(mode = "numeric", length = nrow(df))
names(dB) <- rownames(df)
for (i in 1:nrow(df)) {
beta <- df[i, ]
if (is.null(covar1)) {
mod <- lm(beta ~ mainvar)
} else if (is.null(covar2)) {
mod <- lm(beta ~ mainvar + covar1)
} else if (is.null(covar3)) {
mod <- lm(beta ~ mainvar + covar1 + covar2)
} else if (is.null(covar4)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
} else if (is.null(covar5)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
} else {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
}
slope <- mod$coefficients[2]
dB[i] <- as.numeric(slope*range)
}
dB
}
betas.funnorm.filt <- m2beta(M_values.funnorm.filt)
delta_beta_Age.SV_fixed <- deltabeta(as.matrix(betas.funnorm.filt), AMD_pData$Age, covar1 = AMD_pData$SV)
length(delta_beta_Age.SV_fixed)
summary(delta_beta_Age.SV_fixed)
save(delta_beta_Age.SV_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.SV_fixed.RData")##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.SV_fixed.RData")
#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")
#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Age.SV_Table <- data.frame(rownames(M_values.funnorm.filt), Age.SV_LM_pval, Multi_test_corr_relaxed, delta_beta_Age.SV_fixed)
colnames(Age.SV_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Age.SV_Table)), as.character(Age.SV_Table$CpG)) #TRUE.## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Age.SV_Table[order(Age.SV_Table$Nominal_P),],10)## CpG Nominal_P FDR Delta_Beta
## cg22197050 cg22197050 7.223296e-08 0.02897447 0.07358795
## cg25359907 cg25359907 1.752215e-07 0.02897447 0.04685236
## cg06596654 cg06596654 2.082010e-07 0.02897447 0.19414626
## cg21875802 cg21875802 2.906234e-07 0.02897447 0.15773330
## cg05037876 cg05037876 3.405108e-07 0.02897447 0.05090909
## cg02727104 cg02727104 1.053368e-06 0.06004712 0.12536605
## cg10074727 cg10074727 1.230206e-06 0.06004712 0.12122329
## cg09011833 cg09011833 1.471122e-06 0.06004712 0.04802012
## cg22166290 cg22166290 1.486331e-06 0.06004712 0.06491800
## cg23813012 cg23813012 1.659727e-06 0.06004712 0.08996610
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Age.SV_Table$Nominal_P, Age.SV_Table$Delta_Beta, 0.05, 5e-6, "DNAm changes", 0.3) #at 5e-6: 17 Hypermethylated, 1 Hypomethylated## [1] "Hypermethylated: 17"
## [1] "Hypomethylated: 1"
## Warning: Removed 230 rows containing missing values (geom_point).
#What are these hits?
#First make an annotated table - load 450K manifest.
load("/home/BCRICWH.LAN/ngladish/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
colnames(EPIC_Annotation_Complete)## [1] "Name"
## [2] "AddressA_ID"
## [3] "AlleleA_ProbeSeq"
## [4] "AddressB_ID"
## [5] "AlleleB_ProbeSeq"
## [6] "Infinium_Design_Type"
## [7] "Next_Base"
## [8] "Color_Channel"
## [9] "Forward_Sequence"
## [10] "Genome_Build"
## [11] "CHR"
## [12] "MAPINFO"
## [13] "SourceSeq"
## [14] "Strand"
## [15] "UCSC_RefGene_Name"
## [16] "UCSC_RefGene_Accession"
## [17] "UCSC_RefGene_Group"
## [18] "UCSC_CpG_Islands_Name"
## [19] "Relation_to_UCSC_CpG_Island"
## [20] "Phantom4_Enhancers"
## [21] "Phantom5_Enhancers"
## [22] "DMR"
## [23] "X450k_Enhancer"
## [24] "HMM_Island"
## [25] "Regulatory_Feature_Name"
## [26] "Regulatory_Feature_Group"
## [27] "GencodeBasicV12_NAME"
## [28] "GencodeBasicV12_Accession"
## [29] "GencodeBasicV12_Group"
## [30] "GencodeCompV12_NAME"
## [31] "GencodeCompV12_Accession"
## [32] "GencodeCompV12_Group"
## [33] "DNase_Hypersensitivity_NAME"
## [34] "DNase_Hypersensitivity_Evidence_Count"
## [35] "OpenChromatin_NAME"
## [36] "OpenChromatin_Evidence_Count"
## [37] "TFBS_NAME"
## [38] "TFBS_Evidence_Count"
## [39] "Methyl27_Loci"
## [40] "Methyl450_Loci"
## [41] "Chromosome_36"
## [42] "Coordinate_36"
## [43] "SNP_ID"
## [44] "SNP_DISTANCE"
## [45] "SNP_MinorAlleleFrequency"
## [46] "Random_Loci"
## [47] "X"
## [48] "strand"
## [49] "Probe_rs"
## [50] "Probe_maf"
## [51] "CpG_rs"
## [52] "CpG_maf"
## [53] "SBE_rs"
## [54] "SBE_maf"
## [55] "CH_450_XY"
## [56] "CH_450_Aut"
## [57] "CH_EPIC"
## [58] "Cross_Hyb"
Age.SV_Table.annotated = merge(Age.SV_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Age.SV_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Age.SV_Table.annotated <- Age.SV_Table.annotated[order(Age.SV_Table.annotated$Nominal_P),]
#Grabbing the Volcano hits:
LM_Age.SV_Hits <- Age.SV_Table.annotated[which(abs(Age.SV_Table.annotated$Delta_Beta)>0.05 & Age.SV_Table.annotated$Nominal_P<5e-6),]
#Let's order by Nominal_P:
LM_Age.SV_Hits = LM_Age.SV_Hits[order(LM_Age.SV_Hits$Nominal_P),]
rownames(LM_Age.SV_Hits) = c()
str(LM_Age.SV_Hits)## 'data.frame': 17 obs. of 8 variables:
## $ CpG : Factor w/ 425456 levels "cg00000029","cg00000108",..: 345056 112794 340924 88163 48814 168455 344702 367394 106386 28944 ...
## $ Nominal_P : num 7.22e-08 2.08e-07 2.91e-07 3.41e-07 1.05e-06 ...
## $ FDR : num 0.029 0.029 0.029 0.029 0.06 ...
## $ Delta_Beta : num 0.0736 0.1941 0.1577 0.0509 0.1254 ...
## $ Chromosome : Factor w/ 25 levels "","1","10","11",..: 13 9 13 5 6 20 13 2 13 18 ...
## $ Coordinate : Factor w/ 3 levels "","F","R": 3 3 3 2 2 3 2 3 3 3 ...
## $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 30522 1 1 51815 1 21228 4760 45910 54913 1 ...
## $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 5441 1 1 3719 1 5441 151 5441 1209 1 ...
head(LM_Age.SV_Hits)## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 1 cg22197050 7.223296e-08 0.02897447 0.07358795 2 R
## 2 cg06596654 2.082010e-07 0.02897447 0.19414626 16 R
## 3 cg21875802 2.906234e-07 0.02897447 0.15773330 2 R
## 4 cg05037876 3.405108e-07 0.02897447 0.05090909 12 F
## 5 cg02727104 1.053368e-06 0.06004712 0.12536605 13 F
## 6 cg10074727 1.230206e-06 0.06004712 0.12122329 6 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 1 LOC100132215 TSS1500
## 2
## 3
## 4 SFRS8 Body
## 5
## 6 GCM2 TSS1500
Age-associated hits: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3482848/ Non-tissue specific hits: https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-018-0191-3
#Reorder row index.
rownames(Age.SV_Table.annotated) <- NULL
#Look at hits based on candidate genes.
head(Age.SV_Table.annotated[which(Age.SV_Table.annotated$UCSC_RefGene_Name == "ELOVL2"),]) #First ELOVL2 hit == row 78.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 72 cg16867657 3.463246e-05 0.1809725 0.12392410 6 F
## 396 cg24724428 2.750514e-04 0.2754662 0.14106900 6 F
## 4137 cg21572722 4.903703e-03 0.4655403 0.06381768 6 F
## 16853 cg01799681 2.590172e-02 0.5940401 -0.06551984 6 F
## 72273 cg13562911 1.398169e-01 0.7592642 0.01746062 6 R
## 143610 cg25151806 3.067914e-01 0.8427731 0.01099010 6 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 72 ELOVL2 TSS1500
## 396 ELOVL2 TSS1500
## 4137 ELOVL2 TSS1500
## 16853 ELOVL2 Body
## 72273 ELOVL2 Body
## 143610 ELOVL2 TSS1500
head(Age.SV_Table.annotated[which(Age.SV_Table.annotated$UCSC_RefGene_Name == "EDARADD"),]) #First EDARADD hit == row 92628.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 85570 cg18964582 0.1697889 0.7798321 0.01286814 1 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 85570 EDARADD TSS1500
head(Age.SV_Table.annotated[which(Age.SV_Table.annotated$UCSC_RefGene_Name == "TOM1L1"),]) #First TOM1L1 hit == row 836.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 778 cg25431220 0.000621525 0.3161219 0.02266178 17 F
## 6860 cg07081054 0.008901437 0.5071876 0.05113179 17 R
## 28131 cg03870845 0.047096992 0.6500937 0.01823156 17 F
## 48904 cg05265484 0.088672703 0.7092635 0.06107258 17 F
## 77772 cg10237252 0.152027983 0.7676020 0.08424283 17 R
## 99535 cg12240603 0.201335729 0.7957406 0.02151003 17 F
## UCSC_RefGene_Name UCSC_RefGene_Group
## 778 TOM1L1 TSS200
## 6860 TOM1L1 Body
## 28131 TOM1L1 TSS200
## 48904 TOM1L1 TSS1500
## 77772 TOM1L1 TSS1500
## 99535 TOM1L1 TSS1500
head(Age.SV_Table.annotated[which(Age.SV_Table.annotated$UCSC_RefGene_Name == "NPTX2"),]) #First NPTX2 hit == row 3231.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 3006 cg02368096 0.003344363 0.4403836 0.07417906 7 R
## 10763 cg13878520 0.015165880 0.5479406 0.03723864 7 R
## 40150 cg08315202 0.070754743 0.6875276 0.04161135 7 R
## 44194 cg13314145 0.078951478 0.6980998 0.02343091 7 R
## 52529 cg13695954 0.096517852 0.7192038 0.01227917 7 F
## 55615 cg13585675 0.103227394 0.7268572 0.03538950 7 F
## UCSC_RefGene_Name UCSC_RefGene_Group
## 3006 NPTX2 Body
## 10763 NPTX2 Body
## 40150 NPTX2 TSS1500
## 44194 NPTX2 TSS1500
## 52529 NPTX2 Body
## 55615 NPTX2 Body
SV (cell type) and chip as covariates.
#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")
dim(AMD_pData) #44 samples.## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.## [1] 425456 44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)
str(AMD_pData)## 'data.frame': 44 obs. of 22 variables:
## $ Sample_Name : chr "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
## $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
## $ Age : num 61 74 70 76 79 89 66 70 83 76 ...
## $ Tissue : chr "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
## $ Row : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
## $ Chip : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
## $ Basename : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ filenames : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ xMed : num 11.4 11.7 11.9 11.7 11.6 ...
## $ yMed : num 11.7 12 12.1 11.9 11.9 ...
## $ predictedSex : chr "M" "M" "M" "M" ...
## $ SV : num -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
## $ Epithelial : num 0.387 0.436 0.367 0.413 0.387 ...
## $ Fibroblast : num 0.326 0.316 0.298 0.24 0.312 ...
## $ B_Cell : num 0.0502 0.0454 0.0582 0.0528 0.0482 ...
## $ NK_Cell : num 0.065 0.0601 0.0816 0.0986 0.0743 ...
## $ CD4T : num 0.0478 0.0467 0.0545 0.0658 0.0544 ...
## $ CD8T : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Monocyte : num 0.0685 0.05 0.081 0.0666 0.0651 ...
## $ Neutrophil : num 0 0 0 0 0 ...
## $ Eosinophil : num 0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. ## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values.
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. ## [1] TRUE
library(pbapply) #Progress bar for apply functions.
#EWAS on Age - All samples + SV + Chip.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age.SV.chip_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
meta <- AMD_pData
meta$Mval <- M_values.funnorm.filt[CpG,]
mod_Age.SV.chip <- lm(Mval ~ Age + SV + Chip, data = meta) #Only SV + Chip as covariate.
coef(summary(mod_Age.SV.chip))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age.SV.chip_LM_pval)
save(Age.SV.chip_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.SV.chip_LM_pval.RData")load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.SV.chip_LM_pval.RData")
#Inspect p-value distribution for model.
pvalue_dist_Age.SV.chip <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age.SV.chip_LM_pval)
ggplot(pvalue_dist_Age.SV.chip, aes(Nominal_P)) +
geom_histogram(fill = "grey90", color = "black") +
theme_classic() + xlab("Nominal P Value") +
ylim(0, 20000) +
xlim(min(Age.SV.chip_LM_pval), max(Age.SV.chip_LM_pval))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#Right-skewed.
#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Age.SV.chip_LM_pval, method = "fdr", n = length(Age.SV.chip_LM_pval))
#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #0 at 0.05.## [1] 0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #1 at 0.1.## [1] 1 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #12 at 0.2.## [1] 12 44
#Looking at top hits by nominal P:
pvalue_dist_Age.SV.chip <- pvalue_dist_Age.SV.chip[order(pvalue_dist_Age.SV.chip$Nominal_P),]
head(pvalue_dist_Age.SV.chip)## CpG Nominal_P
## 51300 cg22197050 1.494583e-07
## 49545 cg21875802 5.393880e-07
## 291484 cg05037876 1.510287e-06
## 100675 cg25359907 1.980075e-06
## 347431 cg06596654 2.143129e-06
## 176395 cg00361495 3.970641e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
dim(hits_CpGs <- pvalue_dist_Age.SV.chip[which(pvalue_dist_Age.SV.chip$Nominal_P < 1e-6),]) #2 hits.## [1] 2 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name## [1] LOC100132215
## 66070 Levels: A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
# Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
# mainvar should be a vector of continuous variable
# all covars should also be vectors
# df = dataframe or matrix of beta values
# output is a vector of delta beta values
sd=sd(mainvar)
qt <-
range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
dB <- vector(mode = "numeric", length = nrow(df))
names(dB) <- rownames(df)
for (i in 1:nrow(df)) {
beta <- df[i, ]
if (is.null(covar1)) {
mod <- lm(beta ~ mainvar)
} else if (is.null(covar2)) {
mod <- lm(beta ~ mainvar + covar1)
} else if (is.null(covar3)) {
mod <- lm(beta ~ mainvar + covar1 + covar2)
} else if (is.null(covar4)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
} else if (is.null(covar5)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
} else {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
}
slope <- mod$coefficients[2]
dB[i] <- as.numeric(slope*range)
}
dB
}
betas.funnorm.filt <- m2beta(M_values.funnorm.filt)
delta_beta_Age.SV.chip_fixed <- deltabeta(as.matrix(betas.funnorm.filt), AMD_pData$Age, covar1 = AMD_pData$SV, covar2 = AMD_pData$Chip)
length(delta_beta_Age.SV.chip_fixed)
summary(delta_beta_Age.SV.chip_fixed)
save(delta_beta_Age.SV.chip_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.SV.chip_fixed.RData")##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.SV.chip_fixed.RData")
#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")
#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Age.SV.chip_Table <- data.frame(rownames(M_values.funnorm.filt), Age.SV.chip_LM_pval, Multi_test_corr_relaxed, delta_beta_Age.SV.chip_fixed)
colnames(Age.SV.chip_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Age.SV.chip_Table)), as.character(Age.SV.chip_Table$CpG)) #TRUE.## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Age.SV.chip_Table[order(Age.SV.chip_Table$Nominal_P),],10)## CpG Nominal_P FDR Delta_Beta
## cg22197050 cg22197050 1.494583e-07 0.06358795 0.074437117
## cg21875802 cg21875802 5.393880e-07 0.11474293 0.154439604
## cg05037876 cg05037876 1.510287e-06 0.18236143 0.050491846
## cg25359907 cg25359907 1.980075e-06 0.18236143 0.043596572
## cg06596654 cg06596654 2.143129e-06 0.18236143 0.181927172
## cg00361495 cg00361495 3.970641e-06 0.19812786 0.167257219
## cg22166290 cg22166290 4.371720e-06 0.19812786 0.066983153
## cg22746333 cg22746333 4.424913e-06 0.19812786 0.032010515
## cg06173889 cg06173889 4.917622e-06 0.19812786 0.208709848
## cg14881459 cg14881459 5.347790e-06 0.19812786 -0.003442354
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Age.SV.chip_Table$Nominal_P, Age.SV.chip_Table$Delta_Beta, 0.05, 1e-6, "DNAm changes", 0.3) #at 5e-6: 9 Hypermethylated, 0 Hypomethylated## [1] "Hypermethylated: 2"
## [1] "Hypomethylated: 0"
## Warning: Removed 291 rows containing missing values (geom_point).
#What are these hits?
#First make an annotated table - load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
Age.SV.chip_Table.annotated = merge(Age.SV.chip_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Age.SV.chip_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Age.SV.chip_Table.annotated <- Age.SV.chip_Table.annotated[order(Age.SV.chip_Table.annotated$Nominal_P),]
#Grabbing the Volcano hits:
LM_Age.SV.chip_Hits <- Age.SV.chip_Table.annotated[which(abs(Age.SV.chip_Table.annotated$Delta_Beta)>0.05 & Age.SV.chip_Table.annotated$Nominal_P<1e-6),]
#Let's order by Nominal_P:
LM_Age.SV.chip_Hits = LM_Age.SV.chip_Hits[order(LM_Age.SV.chip_Hits$Nominal_P),]
rownames(LM_Age.SV.chip_Hits) = c()
str(LM_Age.SV.chip_Hits)## 'data.frame': 2 obs. of 8 variables:
## $ CpG : Factor w/ 425456 levels "cg00000029","cg00000108",..: 345056 340924
## $ Nominal_P : num 1.49e-07 5.39e-07
## $ FDR : num 0.0636 0.1147
## $ Delta_Beta : num 0.0744 0.1544
## $ Chromosome : Factor w/ 25 levels "","1","10","11",..: 13 13
## $ Coordinate : Factor w/ 3 levels "","F","R": 3 3
## $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 30522 1
## $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 5441 1
head(LM_Age.SV.chip_Hits)## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 1 cg22197050 1.494583e-07 0.06358795 0.07443712 2 R
## 2 cg21875802 5.393880e-07 0.11474293 0.15443960 2 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 1 LOC100132215 TSS1500
## 2
Age-associated hits: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3482848/ Non-tissue specific hits: https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-018-0191-3
#Reorder row index.
rownames(Age.SV.chip_Table.annotated) <- NULL
#Look at hits based on candidate genes.
head(Age.SV.chip_Table.annotated[which(Age.SV.chip_Table.annotated$CpG %in% c("cg16867657", "cg21572722", "cg24724428")),]) #Identified ELOVL2 sites.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 317 cg16867657 0.0004391873 0.5528210 0.09583086 6 F
## 1925 cg24724428 0.0030391951 0.6188314 0.10685771 6 F
## 13843 cg21572722 0.0265274260 0.7303255 0.05046642 6 F
## UCSC_RefGene_Name UCSC_RefGene_Group
## 317 ELOVL2 TSS1500
## 1925 ELOVL2 TSS1500
## 13843 ELOVL2 TSS1500
head(Age.SV.chip_Table.annotated[which(Age.SV.chip_Table.annotated$CpG %in% c("cg06639320", "cg22454769", "cg24079702")),]) #Identified FHL2 sites.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 13065 cg22454769 0.0248804 0.7252093 0.05827410 2 R
## 146874 cg06639320 0.3367485 0.9043558 0.02140123 2 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 13065 FHL2;FHL2;FHL2;FHL2 TSS200;TSS200;5'UTR;TSS200
## 146874 FHL2;FHL2;FHL2;FHL2 TSS200;TSS200;5'UTR;TSS200
head(Age.SV.chip_Table.annotated[which(Age.SV.chip_Table.annotated$UCSC_RefGene_Name == "DIP2C"),]) #First TOM1L1 hit == row 3937.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 132 cg19026125 0.0001882527 0.5405491 0.03074610 10 F
## 2649 cg04759187 0.0043212306 0.6329451 0.03315152 10 R
## 4980 cg19335436 0.0087526527 0.6716375 0.03228136 10 F
## 4993 cg04825276 0.0087756584 0.6716375 0.09008137 10 R
## 7664 cg06042849 0.0137971237 0.6817064 0.07309830 10 F
## 8686 cg16528305 0.0158812181 0.6900287 0.01924932 10 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 132 DIP2C Body
## 2649 DIP2C Body
## 4980 DIP2C Body
## 4993 DIP2C Body
## 7664 DIP2C Body
## 8686 DIP2C Body
head(Age.SV.chip_Table.annotated[which(Age.SV.chip_Table.annotated$UCSC_RefGene_Name == "NPTX2"),]) #First NPTX2 hit == row 6776.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 6062 cg02368096 0.01073513 0.6731179 0.06982893 7 R
## 18378 cg13878520 0.03600149 0.7512299 0.03509352 7 R
## 27591 cg13585675 0.05554126 0.7794433 0.04179393 7 F
## 34876 cg13695954 0.07149678 0.7969352 0.01402217 7 F
## 54116 cg05168977 0.11518763 0.8320291 -0.04176249 7 F
## 63038 cg07666532 0.13566841 0.8430489 0.03103397 7 F
## UCSC_RefGene_Name UCSC_RefGene_Group
## 6062 NPTX2 Body
## 18378 NPTX2 Body
## 27591 NPTX2 Body
## 34876 NPTX2 Body
## 54116 NPTX2 1stExon
## 63038 NPTX2 Body
head(Age.SV.chip_Table.annotated[which(Age.SV.chip_Table.annotated$UCSC_RefGene_Name == "PENK"),]) #First NPTX2 hit == row 6776.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 8369 cg16419235 0.01523717 0.6874777 0.055805237 8 R
## 78619 cg16219603 0.17179867 0.8578390 0.024079036 8 R
## 95990 cg03650233 0.21240700 0.8699754 0.042896140 8 R
## 126250 cg11610346 0.28559578 0.8915285 -0.009300056 8 R
## 230382 cg16072688 0.55096302 0.9446666 0.012854411 8 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 8369 PENK TSS1500
## 78619 PENK TSS1500
## 95990 PENK TSS1500
## 126250 PENK TSS1500
## 230382 PENK TSS1500
Sex and SV (cell type) as covariates.
#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")
dim(AMD_pData) #44 samples.## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.## [1] 425456 44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)
str(AMD_pData)## 'data.frame': 44 obs. of 22 variables:
## $ Sample_Name : chr "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
## $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
## $ Age : num 61 74 70 76 79 89 66 70 83 76 ...
## $ Tissue : chr "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
## $ Row : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
## $ Chip : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
## $ Basename : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ filenames : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ xMed : num 11.4 11.7 11.9 11.7 11.6 ...
## $ yMed : num 11.7 12 12.1 11.9 11.9 ...
## $ predictedSex : chr "M" "M" "M" "M" ...
## $ SV : num -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
## $ Epithelial : num 0.387 0.436 0.367 0.413 0.387 ...
## $ Fibroblast : num 0.326 0.316 0.298 0.24 0.312 ...
## $ B_Cell : num 0.0502 0.0454 0.0582 0.0528 0.0482 ...
## $ NK_Cell : num 0.065 0.0601 0.0816 0.0986 0.0743 ...
## $ CD4T : num 0.0478 0.0467 0.0545 0.0658 0.0544 ...
## $ CD8T : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Monocyte : num 0.0685 0.05 0.081 0.0666 0.0651 ...
## $ Neutrophil : num 0 0 0 0 0 ...
## $ Eosinophil : num 0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. ## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values.
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. ## [1] TRUE
library(pbapply) #Progress bar for apply functions.
#EWAS on Age - All samples + Sex + SV.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age.Sex.SV_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
meta <- AMD_pData
meta$Mval <- M_values.funnorm.filt[CpG,]
mod_Age.Sex.SV <- lm(Mval ~ Age + Sex + SV, data = meta) #Only Sex + SV as covariate.
coef(summary(mod_Age.Sex.SV))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age.Sex.SV_LM_pval)
save(Age.Sex.SV_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.Sex.SV_LM_pval.RData")load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.Sex.SV_LM_pval.RData")
#Inspect p-value distribution for model.
pvalue_dist_Age.Sex.SV <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age.Sex.SV_LM_pval)
ggplot(pvalue_dist_Age.Sex.SV, aes(Nominal_P)) +
geom_histogram(fill = "grey90", color = "black") +
theme_classic() + xlab("Nominal P Value") +
ylim(0, 20000) +
xlim(min(Age.Sex.SV_LM_pval), max(Age.Sex.SV_LM_pval))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#Not right-skewed. Distribution is a little even.
#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Age.Sex.SV_LM_pval, method = "fdr", n = length(Age.Sex.SV_LM_pval))
#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #20 at 0.05.## [1] 20 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #49 at 0.1.## [1] 49 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #143 at 0.2.## [1] 143 44
#Looking at top hits by nominal P:
pvalue_dist_Age.Sex.SV <- pvalue_dist_Age.Sex.SV[order(pvalue_dist_Age.Sex.SV$Nominal_P),]
head(pvalue_dist_Age.Sex.SV)## CpG Nominal_P
## 100675 cg25359907 9.561427e-08
## 323108 cg19005438 3.161929e-07
## 395001 cg04253011 3.515988e-07
## 245266 cg15861585 6.388047e-07
## 51048 cg06528150 7.848523e-07
## 75729 cg15155209 8.500193e-07
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
dim(hits_CpGs <- pvalue_dist_Age.Sex.SV[which(pvalue_dist_Age.Sex.SV$Nominal_P < 1e-6),]) #8 hits.## [1] 8 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name## [1] C2CD4D;LOC100132111 DRD4
## [3] LOC84740
## [5] KIAA1841;KIAA1841;KIAA1841 FOXB1
## [7] PLEKHG2 SLC6A6;SLC6A6
## 66070 Levels: A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
# Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
# mainvar should be a vector of continuous variable
# all covars should also be vectors
# df = dataframe or matrix of beta values
# output is a vector of delta beta values
sd=sd(mainvar)
qt <-
range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
dB <- vector(mode = "numeric", length = nrow(df))
names(dB) <- rownames(df)
for (i in 1:nrow(df)) {
beta <- df[i, ]
if (is.null(covar1)) {
mod <- lm(beta ~ mainvar)
} else if (is.null(covar2)) {
mod <- lm(beta ~ mainvar + covar1)
} else if (is.null(covar3)) {
mod <- lm(beta ~ mainvar + covar1 + covar2)
} else if (is.null(covar4)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
} else if (is.null(covar5)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
} else {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
}
slope <- mod$coefficients[2]
dB[i] <- as.numeric(slope*range)
}
dB
}
betas.funnorm.filt <- m2beta(M_values.funnorm.filt)
delta_beta_Age.Sex.SV_fixed <- deltabeta(as.matrix(betas.funnorm.filt), AMD_pData$Age, covar1 = AMD_pData$Sex, covar2 = AMD_pData$SV)
length(delta_beta_Age.Sex.SV_fixed)
summary(delta_beta_Age.Sex.SV_fixed)
save(delta_beta_Age.Sex.SV_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.Sex.SV_fixed.RData")##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.Sex.SV_fixed.RData")
#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")
#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Age.Sex.SV_Table <- data.frame(rownames(M_values.funnorm.filt), Age.Sex.SV_LM_pval, Multi_test_corr_relaxed, delta_beta_Age.Sex.SV_fixed)
colnames(Age.Sex.SV_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Age.Sex.SV_Table)), as.character(Age.Sex.SV_Table$CpG)) #TRUE.## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Age.Sex.SV_Table[order(Age.Sex.SV_Table$Nominal_P),],10)## CpG Nominal_P FDR Delta_Beta
## cg25359907 cg25359907 9.561427e-08 0.04067967 0.05135459
## cg19005438 cg19005438 3.161929e-07 0.04814805 0.15230396
## cg04253011 cg04253011 3.515988e-07 0.04814805 -0.12068960
## cg15861585 cg15861585 6.388047e-07 0.04814805 0.25150827
## cg06528150 cg06528150 7.848523e-07 0.04814805 0.04162110
## cg15155209 cg15155209 8.500193e-07 0.04814805 -0.18413729
## cg15015892 cg15015892 8.837434e-07 0.04814805 0.29120319
## cg02727104 cg02727104 9.491837e-07 0.04814805 0.13523762
## cg22197050 cg22197050 1.231075e-06 0.04814805 0.06811023
## cg24783211 cg24783211 1.309124e-06 0.04814805 0.22864438
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Age.Sex.SV_Table$Nominal_P, Age.Sex.SV_Table$Delta_Beta, 0.05, 5e-6, "DNAm changes", 0.5) #at 5e-6: 9 Hypermethylated, 0 Hypomethylated## [1] "Hypermethylated: 23"
## [1] "Hypomethylated: 5"
## Warning: Removed 2 rows containing missing values (geom_point).
#What are these hits?
#First make an annotated table - load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
Age.Sex.SV_Table.annotated = merge(Age.Sex.SV_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Age.Sex.SV_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Age.Sex.SV_Table.annotated <- Age.Sex.SV_Table.annotated[order(Age.Sex.SV_Table.annotated$Nominal_P),]
#Grabbing the Volcano hits:
LM_Age.Sex.SV_Hits <- Age.Sex.SV_Table.annotated[which(abs(Age.Sex.SV_Table.annotated$Delta_Beta)>0.05 & Age.Sex.SV_Table.annotated$Nominal_P<5e-6),]
#Let's order by Nominal_P:
LM_Age.Sex.SV_Hits = LM_Age.Sex.SV_Hits[order(LM_Age.Sex.SV_Hits$Nominal_P),]
rownames(LM_Age.Sex.SV_Hits) = c()
str(LM_Age.Sex.SV_Hits)## 'data.frame': 26 obs. of 8 variables:
## $ CpG : Factor w/ 425456 levels "cg00000029","cg00000108",..: 389441 301975 74991 256738 246988 244895 48814 345056 381395 88163 ...
## $ Nominal_P : num 9.56e-08 3.16e-07 3.52e-07 6.39e-07 8.50e-07 ...
## $ FDR : num 0.0407 0.0481 0.0481 0.0481 0.0481 ...
## $ Delta_Beta : num 0.0514 0.1523 -0.1207 0.2515 -0.1841 ...
## $ Chromosome : Factor w/ 25 levels "","1","10","11",..: 18 8 12 4 17 2 6 13 18 5 ...
## $ Coordinate : Factor w/ 3 levels "","F","R": 3 2 2 2 2 3 2 3 3 2 ...
## $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 32758 20278 44629 15936 53494 7376 1 30522 1 51815 ...
## $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 5441 5441 3719 5441 4133 5126 1 5441 1 3719 ...
head(LM_Age.Sex.SV_Hits)## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 1 cg25359907 9.561427e-08 0.04067967 0.05135459 4 R
## 2 cg19005438 3.161929e-07 0.04814805 0.15230396 15 F
## 3 cg04253011 3.515988e-07 0.04814805 -0.12068960 19 F
## 4 cg15861585 6.388047e-07 0.04814805 0.25150827 11 F
## 5 cg15155209 8.500193e-07 0.04814805 -0.18413729 3 F
## 6 cg15015892 8.837434e-07 0.04814805 0.29120319 1 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 1 LOC84740 TSS1500
## 2 FOXB1 TSS1500
## 3 PLEKHG2 Body
## 4 DRD4 TSS1500
## 5 SLC6A6;SLC6A6 Body;Body
## 6 C2CD4D;LOC100132111 Body;TSS200
Age-associated hits: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3482848/ Non-tissue specific hits: https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-018-0191-3
#Reorder row index.
rownames(Age.Sex.SV_Table.annotated) <- NULL
#Look at hits based on candidate genes.
head(Age.Sex.SV_Table.annotated[which(Age.Sex.SV_Table.annotated$UCSC_RefGene_Name == "ELOVL2"),]) #First ELOVL2 hit == row 278.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 82 cg16867657 4.050478e-05 0.1909535 0.13208991 6 F
## 884 cg24724428 7.100348e-04 0.3174740 0.13746296 6 F
## 4921 cg21572722 5.900411e-03 0.4754659 0.06612837 6 F
## 9021 cg01799681 1.236828e-02 0.5416599 -0.07939168 6 F
## 104408 cg16323298 2.252035e-01 0.8520043 -0.02284845 6 F
## 110163 cg13562911 2.392363e-01 0.8579563 0.01397406 6 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 82 ELOVL2 TSS1500
## 884 ELOVL2 TSS1500
## 4921 ELOVL2 TSS1500
## 9021 ELOVL2 Body
## 104408 ELOVL2 TSS1500
## 110163 ELOVL2 Body
head(Age.Sex.SV_Table.annotated[which(Age.Sex.SV_Table.annotated$UCSC_RefGene_Name == "EDARADD"),]) #First EDARADD hit == row 111292.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 70116 cg18964582 0.1429912 0.8056483 0.01454488 1 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 70116 EDARADD TSS1500
head(Age.Sex.SV_Table.annotated[which(Age.Sex.SV_Table.annotated$UCSC_RefGene_Name == "TOM1L1"),]) #First TOM1L1 hit == row 2140.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 1195 cg25431220 0.001022328 0.3389639 0.02343953 17 F
## 5163 cg07081054 0.006257355 0.4796366 0.05648637 17 R
## 41557 cg03870845 0.077631573 0.7376456 0.01682909 17 F
## 60672 cg05265484 0.120857479 0.7869535 0.05997839 17 F
## 66315 cg10237252 0.133852023 0.7974528 0.09643585 17 R
## 148404 cg12240603 0.335573177 0.8936094 0.01685166 17 F
## UCSC_RefGene_Name UCSC_RefGene_Group
## 1195 TOM1L1 TSS200
## 5163 TOM1L1 Body
## 41557 TOM1L1 TSS200
## 60672 TOM1L1 TSS1500
## 66315 TOM1L1 TSS1500
## 148404 TOM1L1 TSS1500
head(Age.Sex.SV_Table.annotated[which(Age.Sex.SV_Table.annotated$UCSC_RefGene_Name == "NPTX2"),]) #First NPTX2 hit == row 8828.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 6561 cg02368096 0.008382397 0.5050415 0.06860593 7 R
## 21042 cg13878520 0.034448045 0.6466652 0.03481473 7 R
## 25935 cg13314145 0.044192157 0.6732259 0.02981350 7 R
## 35336 cg08315202 0.063950142 0.7147905 0.04385453 7 R
## 40619 cg13585675 0.075444661 0.7334426 0.04452483 7 F
## 79584 cg13695954 0.165389642 0.8210634 0.01155458 7 F
## UCSC_RefGene_Name UCSC_RefGene_Group
## 6561 NPTX2 Body
## 21042 NPTX2 Body
## 25935 NPTX2 TSS1500
## 35336 NPTX2 TSS1500
## 40619 NPTX2 Body
## 79584 NPTX2 Body
Sex, SV (cell type) and chip as covariates.
#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")
dim(AMD_pData) #44 samples.## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.## [1] 425456 44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)
str(AMD_pData)## 'data.frame': 44 obs. of 22 variables:
## $ Sample_Name : chr "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
## $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
## $ Age : num 61 74 70 76 79 89 66 70 83 76 ...
## $ Tissue : chr "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
## $ Row : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
## $ Chip : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
## $ Basename : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ filenames : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ xMed : num 11.4 11.7 11.9 11.7 11.6 ...
## $ yMed : num 11.7 12 12.1 11.9 11.9 ...
## $ predictedSex : chr "M" "M" "M" "M" ...
## $ SV : num -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
## $ Epithelial : num 0.387 0.436 0.367 0.413 0.387 ...
## $ Fibroblast : num 0.326 0.316 0.298 0.24 0.312 ...
## $ B_Cell : num 0.0502 0.0454 0.0582 0.0528 0.0482 ...
## $ NK_Cell : num 0.065 0.0601 0.0816 0.0986 0.0743 ...
## $ CD4T : num 0.0478 0.0467 0.0545 0.0658 0.0544 ...
## $ CD8T : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Monocyte : num 0.0685 0.05 0.081 0.0666 0.0651 ...
## $ Neutrophil : num 0 0 0 0 0 ...
## $ Eosinophil : num 0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. ## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values.
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. ## [1] TRUE
library(pbapply) #Progress bar for apply functions.
#EWAS on Age - All samples + Sex + SV + Chip.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age.Sex.SV.chip_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
meta <- AMD_pData
meta$Mval <- M_values.funnorm.filt[CpG,]
mod_Age.Sex.SV.chip <- lm(Mval ~ Age + Sex + SV + Chip, data = meta) #Only Sex + SV + Chip as covariate.
coef(summary(mod_Age.Sex.SV.chip))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age.Sex.SV.chip_LM_pval)
save(Age.Sex.SV.chip_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.Sex.SV.chip_LM_pval.RData")load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.Sex.SV.chip_LM_pval.RData")
#Inspect p-value distribution for model.
pvalue_dist_Age.Sex.SV.chip <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age.Sex.SV.chip_LM_pval)
ggplot(pvalue_dist_Age.Sex.SV.chip, aes(Nominal_P)) +
geom_histogram(fill = "grey90", color = "black") +
theme_classic() + xlab("Nominal P Value") +
ylim(0, 20000) +
xlim(min(Age.Sex.SV.chip_LM_pval), max(Age.Sex.SV.chip_LM_pval))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#Not right-skewed. Distribution is a little even.
#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Age.Sex.SV.chip_LM_pval, method = "fdr", n = length(Age.Sex.SV.chip_LM_pval))
#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #20 at 0.05.## [1] 0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #49 at 0.1.## [1] 0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #143 at 0.2.## [1] 24 44
#Looking at top hits by nominal P:
pvalue_dist_Age.Sex.SV.chip <- pvalue_dist_Age.Sex.SV.chip[order(pvalue_dist_Age.Sex.SV.chip$Nominal_P),]
head(pvalue_dist_Age.Sex.SV.chip)## CpG Nominal_P
## 75729 cg15155209 5.422114e-07
## 201801 cg21132564 7.437805e-07
## 27468 cg24155129 8.366901e-07
## 296800 cg27375012 9.669248e-07
## 100675 cg25359907 1.527042e-06
## 296713 cg06968164 2.590083e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
dim(hits_CpGs <- pvalue_dist_Age.Sex.SV.chip[which(pvalue_dist_Age.Sex.SV.chip$Nominal_P < 1e-6),]) #8 hits.## [1] 4 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name## [1] SLC25A30 ANK1 SLC6A6;SLC6A6
## 66070 Levels: A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
# Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
# mainvar should be a vector of continuous variable
# all covars should also be vectors
# df = dataframe or matrix of beta values
# output is a vector of delta beta values
sd=sd(mainvar)
qt <-
range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
dB <- vector(mode = "numeric", length = nrow(df))
names(dB) <- rownames(df)
for (i in 1:nrow(df)) {
beta <- df[i, ]
if (is.null(covar1)) {
mod <- lm(beta ~ mainvar)
} else if (is.null(covar2)) {
mod <- lm(beta ~ mainvar + covar1)
} else if (is.null(covar3)) {
mod <- lm(beta ~ mainvar + covar1 + covar2)
} else if (is.null(covar4)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
} else if (is.null(covar5)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
} else {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
}
slope <- mod$coefficients[2]
dB[i] <- as.numeric(slope*range)
}
dB
}
betas.funnorm.filt <- m2beta(M_values.funnorm.filt)
delta_beta_Age.Sex.SV.chip_fixed <- deltabeta(as.matrix(betas.funnorm.filt), AMD_pData$Age, covar1 = AMD_pData$Sex, covar2 = AMD_pData$SV, covar3 = AMD_pData$Chip)
length(delta_beta_Age.Sex.SV.chip_fixed)
summary(delta_beta_Age.Sex.SV.chip_fixed)
save(delta_beta_Age.Sex.SV.chip_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.Sex.SV.chip_fixed.RData")##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.Sex.SV.chip_fixed.RData")
#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")
#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Age.Sex.SV.chip_Table <- data.frame(rownames(M_values.funnorm.filt), Age.Sex.SV.chip_LM_pval, Multi_test_corr_relaxed, delta_beta_Age.Sex.SV.chip_fixed)
colnames(Age.Sex.SV.chip_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Age.Sex.SV.chip_Table)), as.character(Age.Sex.SV.chip_Table$CpG)) #TRUE.## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Age.Sex.SV.chip_Table[order(Age.Sex.SV.chip_Table$Nominal_P),],10)## CpG Nominal_P FDR Delta_Beta
## cg15155209 cg15155209 5.422114e-07 0.1028460 -0.20354698
## cg21132564 cg21132564 7.437805e-07 0.1028460 0.08116935
## cg24155129 cg24155129 8.366901e-07 0.1028460 0.17814272
## cg27375012 cg27375012 9.669248e-07 0.1028460 0.02885347
## cg25359907 cg25359907 1.527042e-06 0.1299378 0.04918882
## cg06968164 cg06968164 2.590083e-06 0.1449706 -0.16283149
## cg22197050 cg22197050 2.707523e-06 0.1449706 0.07129775
## cg19005438 cg19005438 2.725933e-06 0.1449706 0.15603705
## cg06173889 cg06173889 3.857674e-06 0.1513159 0.23571242
## cg05800683 cg05800683 3.877357e-06 0.1513159 0.01597645
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Age.Sex.SV.chip_Table$Nominal_P, Age.Sex.SV.chip_Table$Delta_Beta, 0.05, 3e-6, "DNAm changes", 0.5) #at 5e-6: 9 Hypermethylated, 0 Hypomethylated## [1] "Hypermethylated: 4"
## [1] "Hypomethylated: 2"
## Warning: Removed 1 rows containing missing values (geom_point).
#What are these hits?
#First make an annotated table - load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
Age.Sex.SV.chip_Table.annotated = merge(Age.Sex.SV.chip_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Age.Sex.SV.chip_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Age.Sex.SV.chip_Table.annotated <- Age.Sex.SV.chip_Table.annotated[order(Age.Sex.SV.chip_Table.annotated$Nominal_P),]
#Grabbing the Volcano hits:
LM_Age.Sex.SV.chip_Hits <- Age.Sex.SV.chip_Table.annotated[which(abs(Age.Sex.SV.chip_Table.annotated$Delta_Beta)>0.05 & Age.Sex.SV.chip_Table.annotated$Nominal_P<4e-6),]
#Let's order by Nominal_P:
LM_Age.Sex.SV.chip_Hits = LM_Age.Sex.SV.chip_Hits[order(LM_Age.Sex.SV.chip_Hits$Nominal_P),]
rownames(LM_Age.Sex.SV.chip_Hits) = c()
str(LM_Age.Sex.SV.chip_Hits)## 'data.frame': 6 obs. of 8 variables:
## $ CpG : Factor w/ 425456 levels "cg00000029","cg00000108",..: 246988 330979 119392 345056 301975 106386
## $ Nominal_P : num 5.42e-07 7.44e-07 2.59e-06 2.71e-06 2.73e-06 ...
## $ FDR : num 0.103 0.103 0.145 0.145 0.145 ...
## $ Delta_Beta : num -0.2035 0.0812 -0.1628 0.0713 0.156 ...
## $ Chromosome : Factor w/ 25 levels "","1","10","11",..: 17 22 6 13 8 13
## $ Coordinate : Factor w/ 3 levels "","F","R": 2 2 2 3 2 3
## $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 53494 2154 1 30522 20278 54913
## $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 4133 3719 1 5441 5441 1209
head(LM_Age.Sex.SV.chip_Hits)## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 1 cg15155209 5.422114e-07 0.1028460 -0.20354698 3 F
## 2 cg21132564 7.437805e-07 0.1028460 0.08116935 8 F
## 3 cg06968164 2.590083e-06 0.1449706 -0.16283149 13 F
## 4 cg22197050 2.707523e-06 0.1449706 0.07129775 2 R
## 5 cg19005438 2.725933e-06 0.1449706 0.15603705 15 F
## 6 cg06173889 3.857674e-06 0.1513159 0.23571242 2 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 1 SLC6A6;SLC6A6 Body;Body
## 2 ANK1 Body
## 3
## 4 LOC100132215 TSS1500
## 5 FOXB1 TSS1500
## 6 SOX11;SOX11 3'UTR;1stExon
Age-associated hits: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3482848/ Non-tissue specific hits: https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-018-0191-3
#Reorder row index.
rownames(Age.Sex.SV.chip_Table.annotated) <- NULL
#Look at hits based on candidate genes.
head(Age.Sex.SV.chip_Table.annotated[which(Age.Sex.SV.chip_Table.annotated$UCSC_RefGene_Name == "ELOVL2"),]) #First ELOVL2 hit == row 278.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 622 cg16867657 0.0009175385 0.5735714 0.100948234 6 F
## 7374 cg24724428 0.0136224422 0.7290164 0.096426750 6 F
## 20216 cg21572722 0.0415317908 0.8107440 0.050285162 6 F
## 28088 cg01799681 0.0599461434 0.8413430 -0.065848888 6 F
## 56933 cg16323298 0.1298762295 0.9010245 -0.031395803 6 F
## 174463 cg25151806 0.4262825709 0.9660973 0.008674443 6 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 622 ELOVL2 TSS1500
## 7374 ELOVL2 TSS1500
## 20216 ELOVL2 TSS1500
## 28088 ELOVL2 Body
## 56933 ELOVL2 TSS1500
## 174463 ELOVL2 TSS1500
head(Age.Sex.SV.chip_Table.annotated[which(Age.Sex.SV.chip_Table.annotated$UCSC_RefGene_Name == "EDARADD"),]) #First EDARADD hit == row 111292.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 45040 cg18964582 0.1003107 0.8790123 0.01767369 1 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 45040 EDARADD TSS1500
head(Age.Sex.SV.chip_Table.annotated[which(Age.Sex.SV.chip_Table.annotated$UCSC_RefGene_Name == "TOM1L1"),]) #First TOM1L1 hit == row 2140.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 3425 cg07081054 0.005911721 0.6785229 0.06236799 17 R
## 7211 cg25431220 0.013323336 0.7282549 0.01828383 17 F
## 19934 cg05265484 0.040972118 0.8106590 0.08024987 17 F
## 35720 cg10237252 0.078017910 0.8608219 0.11466416 17 R
## 135225 cg12240603 0.326746589 0.9552553 0.02000998 17 F
## 157845 cg13913085 0.383748920 0.9612396 0.04980854 17 F
## UCSC_RefGene_Name UCSC_RefGene_Group
## 3425 TOM1L1 Body
## 7211 TOM1L1 TSS200
## 19934 TOM1L1 TSS1500
## 35720 TOM1L1 TSS1500
## 135225 TOM1L1 TSS1500
## 157845 TOM1L1 Body
head(Age.Sex.SV.chip_Table.annotated[which(Age.Sex.SV.chip_Table.annotated$UCSC_RefGene_Name == "NPTX2"),]) #First NPTX2 hit == row 8828.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 12311 cg13585675 0.02404082 0.7713448 0.05686261 7 F
## 19405 cg02368096 0.03975083 0.8082895 0.05827515 7 R
## 35988 cg05168977 0.07864802 0.8616515 -0.05100356 7 F
## 52267 cg13878520 0.11817578 0.8921699 0.02835619 7 R
## 54246 cg13695954 0.12309024 0.8957100 0.01411725 7 F
## 67178 cg13314145 0.15524681 0.9128074 0.02251316 7 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 12311 NPTX2 Body
## 19405 NPTX2 Body
## 35988 NPTX2 1stExon
## 52267 NPTX2 Body
## 54246 NPTX2 Body
## 67178 NPTX2 TSS1500
AMD samples only with sex, SV (cell type) and chip as covariates
#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")
dim(AMD_pData) #44 samples.## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.## [1] 425456 44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)
str(AMD_pData)## 'data.frame': 44 obs. of 22 variables:
## $ Sample_Name : chr "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
## $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
## $ Age : num 61 74 70 76 79 89 66 70 83 76 ...
## $ Tissue : chr "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
## $ Row : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
## $ Chip : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
## $ Basename : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ filenames : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ xMed : num 11.4 11.7 11.9 11.7 11.6 ...
## $ yMed : num 11.7 12 12.1 11.9 11.9 ...
## $ predictedSex : chr "M" "M" "M" "M" ...
## $ SV : num -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
## $ Epithelial : num 0.387 0.436 0.367 0.413 0.387 ...
## $ Fibroblast : num 0.326 0.316 0.298 0.24 0.312 ...
## $ B_Cell : num 0.0502 0.0454 0.0582 0.0528 0.0482 ...
## $ NK_Cell : num 0.065 0.0601 0.0816 0.0986 0.0743 ...
## $ CD4T : num 0.0478 0.0467 0.0545 0.0658 0.0544 ...
## $ CD8T : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Monocyte : num 0.0685 0.05 0.081 0.0666 0.0651 ...
## $ Neutrophil : num 0 0 0 0 0 ...
## $ Eosinophil : num 0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. ## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values.
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. ## [1] TRUE
#Subset AMD samples only.
AMD <- subset(AMD_pData, Disease_State == "age-related macular degeneration")
dim(AMD) #25 samples.## [1] 25 22
AMD_M_values.funnorm.filt <- M_values.funnorm.filt[, colnames(M_values.funnorm.filt) %in% rownames(AMD)]
dim(AMD_M_values.funnorm.filt) #425456 probes, 25 samples.## [1] 425456 25
rm(AMD_pData)
rm(M_values.funnorm.filt)library(pbapply) #Progress bar for apply functions.
#EWAS on Age - All samples + Sex + SV + Chip.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
AMD.only_Age.Sex.SV.chip_LM_pval <- pbsapply(1:nrow(AMD_M_values.funnorm.filt), function(CpG){
meta <- AMD
meta$Mval <- AMD_M_values.funnorm.filt[CpG,]
mod_AMD.only_Age.Sex.SV.chip <- lm(Mval ~ Age + Sex + SV + Chip, data = meta) #Only Sex + SV + Chip as covariate.
coef(summary(mod_AMD.only_Age.Sex.SV.chip))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(AMD.only_Age.Sex.SV.chip_LM_pval)
save(AMD.only_Age.Sex.SV.chip_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD.only_Age.Sex.SV.chip_LM_pval.RData")load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD.only_Age.Sex.SV.chip_LM_pval.RData")
#Inspect p-value distribution for model.
pvalue_dist_AMD.only_Age.Sex.SV.chip <- data.frame(CpG = rownames(AMD_M_values.funnorm.filt), Nominal_P = AMD.only_Age.Sex.SV.chip_LM_pval)
ggplot(pvalue_dist_AMD.only_Age.Sex.SV.chip, aes(Nominal_P)) +
geom_histogram(fill = "grey90", color = "black") +
theme_classic() + xlab("Nominal P Value") +
ylim(0, 20000) +
xlim(min(AMD.only_Age.Sex.SV.chip_LM_pval), max(AMD.only_Age.Sex.SV.chip_LM_pval))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#Not right-skewed. Distribution is a little even.
#Multiple test correction with FDR.
AMD_M_values.funnorm.filt <- as.data.frame(AMD_M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(AMD.only_Age.Sex.SV.chip_LM_pval, method = "fdr", n = length(AMD.only_Age.Sex.SV.chip_LM_pval))
#Looking at FDR thresholds for hits:
dim(as.data.frame(AMD_M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #0 at 0.05.## [1] 0 25
dim(as.data.frame(AMD_M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #0 at 0.1.## [1] 2 25
dim(as.data.frame(AMD_M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #2 at 0.2.## [1] 2 25
#Looking at top hits by nominal P:
pvalue_dist_AMD.only_Age.Sex.SV.chip <- pvalue_dist_AMD.only_Age.Sex.SV.chip[order(pvalue_dist_AMD.only_Age.Sex.SV.chip$Nominal_P),]
head(pvalue_dist_AMD.only_Age.Sex.SV.chip)## CpG Nominal_P
## 206218 cg07306755 1.390008e-07
## 215414 cg14625731 4.676746e-07
## 296713 cg06968164 4.794239e-06
## 106709 cg26845297 6.235855e-06
## 27468 cg24155129 6.972155e-06
## 409036 cg06049791 7.078629e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
dim(hits_CpGs <- pvalue_dist_AMD.only_Age.Sex.SV.chip[which(pvalue_dist_AMD.only_Age.Sex.SV.chip$Nominal_P < 1e-6),]) #2 hits.## [1] 2 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name## [1] GEM;GEM
## 66070 Levels: A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
# Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
# mainvar should be a vector of continuous variable
# all covars should also be vectors
# df = dataframe or matrix of beta values
# output is a vector of delta beta values
sd=sd(mainvar)
qt <-
range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
dB <- vector(mode = "numeric", length = nrow(df))
names(dB) <- rownames(df)
for (i in 1:nrow(df)) {
beta <- df[i, ]
if (is.null(covar1)) {
mod <- lm(beta ~ mainvar)
} else if (is.null(covar2)) {
mod <- lm(beta ~ mainvar + covar1)
} else if (is.null(covar3)) {
mod <- lm(beta ~ mainvar + covar1 + covar2)
} else if (is.null(covar4)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
} else if (is.null(covar5)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
} else {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
}
slope <- mod$coefficients[2]
dB[i] <- as.numeric(slope*range)
}
dB
}
AMD_betas.funnorm.filt <- m2beta(AMD_M_values.funnorm.filt)
delta_beta_AMD.only_Age.Sex.SV.chip_fixed <- deltabeta(as.matrix(AMD_betas.funnorm.filt), AMD$Age, covar1 = AMD$Sex, covar2 = AMD$SV, covar3 = AMD$Chip)
length(delta_beta_AMD.only_Age.Sex.SV.chip_fixed)
summary(delta_beta_AMD.only_Age.Sex.SV.chip_fixed)
save(delta_beta_AMD.only_Age.Sex.SV.chip_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_AMD.only_Age.Sex.SV.chip_fixed.RData")Chip as covariate.
#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")
dim(AMD_pData) #44 samples.## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.## [1] 425456 44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
AMD_pData$Age <- as.numeric(AMD_pData$Age)
str(AMD_pData)## 'data.frame': 44 obs. of 22 variables:
## $ Sample_Name : chr "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
## $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
## $ Age : num 61 74 70 76 79 89 66 70 83 76 ...
## $ Tissue : chr "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
## $ Row : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
## $ Chip : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
## $ Basename : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ filenames : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ xMed : num 11.4 11.7 11.9 11.7 11.6 ...
## $ yMed : num 11.7 12 12.1 11.9 11.9 ...
## $ predictedSex : chr "M" "M" "M" "M" ...
## $ SV : num -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
## $ Epithelial : num 0.387 0.436 0.367 0.413 0.387 ...
## $ Fibroblast : num 0.326 0.316 0.298 0.24 0.312 ...
## $ B_Cell : num 0.0502 0.0454 0.0582 0.0528 0.0482 ...
## $ NK_Cell : num 0.065 0.0601 0.0816 0.0986 0.0743 ...
## $ CD4T : num 0.0478 0.0467 0.0545 0.0658 0.0544 ...
## $ CD8T : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Monocyte : num 0.0685 0.05 0.081 0.0666 0.0651 ...
## $ Neutrophil : num 0 0 0 0 0 ...
## $ Eosinophil : num 0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. ## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values.
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. ## [1] TRUE
library(pbapply) #Progress bar for apply functions.
#EWAS on Age - All samples + Chip.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age.chip_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
meta <- AMD_pData
meta$Mval <- M_values.funnorm.filt[CpG,]
mod_Age.chip <- lm(Mval ~ Age + Chip, data = meta) #Only Chip as covariate.
coef(summary(mod_Age.chip))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age.chip_LM_pval)
save(Age.chip_LM_pval, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.chip_LM_pval.RData")load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Age.chip_LM_pval.RData")
#Inspect p-value distribution for model.
pvalue_dist_Age.chip <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age.chip_LM_pval)
ggplot(pvalue_dist_Age.chip, aes(Nominal_P)) +
geom_histogram(fill = "grey90", color = "black") +
theme_classic() + xlab("Nominal P Value") +
ylim(0, 20000) +
xlim(min(Age.chip_LM_pval), max(Age.chip_LM_pval))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#Not right-skewed. Distribution is a little even.
#Multiple test correction with FDR.
M_values.funnorm.filt <- as.data.frame(M_values.funnorm.filt)
Multi_test_corr_relaxed <- p.adjust(Age.chip_LM_pval, method = "fdr", n = length(Age.chip_LM_pval))
#Looking at FDR thresholds for hits:
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.05),]) #0 at 0.05.## [1] 0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.1),]) #0 at 0.1.## [1] 0 44
dim(as.data.frame(M_values.funnorm.filt)[which(Multi_test_corr_relaxed <= 0.2),]) #32 at 0.2.## [1] 32 44
#Looking at top hits by nominal P:
pvalue_dist_Age.chip <- pvalue_dist_Age.chip[order(pvalue_dist_Age.chip$Nominal_P),]
head(pvalue_dist_Age.chip)## CpG Nominal_P
## 75729 cg15155209 5.551609e-07
## 100675 cg25359907 1.027507e-06
## 296713 cg06968164 1.297871e-06
## 323108 cg19005438 1.445053e-06
## 43977 cg06173889 2.267861e-06
## 51300 cg22197050 2.610535e-06
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
dim(hits_CpGs <- pvalue_dist_Age.chip[which(pvalue_dist_Age.chip$Nominal_P < 1e-6),]) #1 hits.## [1] 1 2
hits <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name%in%hits_CpGs$CpG),]
hits$UCSC_RefGene_Name## [1] SLC6A6;SLC6A6
## 66070 Levels: A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
#Delta beta.
#Using Maggie's code for deltabeta:
deltabeta <- function(df, mainvar, covar1 = NULL, covar2 = NULL, covar3 = NULL, covar4 = NULL, covar5 = NULL) {
# Calculating delta beta of the main variable of interest (mainvar), with up to 5 possible covariates (covar)
# mainvar should be a vector of continuous variable
# all covars should also be vectors
# df = dataframe or matrix of beta values
# output is a vector of delta beta values
sd=sd(mainvar)
qt <-
range <- max(mainvar, na.rm = T) - min(mainvar, na.rm = T)
dB <- vector(mode = "numeric", length = nrow(df))
names(dB) <- rownames(df)
for (i in 1:nrow(df)) {
beta <- df[i, ]
if (is.null(covar1)) {
mod <- lm(beta ~ mainvar)
} else if (is.null(covar2)) {
mod <- lm(beta ~ mainvar + covar1)
} else if (is.null(covar3)) {
mod <- lm(beta ~ mainvar + covar1 + covar2)
} else if (is.null(covar4)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3)
} else if (is.null(covar5)) {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4)
} else {
mod <- lm(beta ~ mainvar + covar1 + covar2 + covar3 + covar4 + covar5)
}
slope <- mod$coefficients[2]
dB[i] <- as.numeric(slope*range)
}
dB
}
betas.funnorm.filt <- m2beta(M_values.funnorm.filt)
delta_beta_Age.chip_fixed <- deltabeta(as.matrix(betas.funnorm.filt), AMD_pData$Age, covar1 = AMD_pData$Chip)
length(delta_beta_Age.chip_fixed)
summary(delta_beta_Age.chip_fixed)
save(delta_beta_Age.chip_fixed, file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.chip_fixed.RData")##3. Linear Model: Volcano Plot
#Volcano to examine hits (for DB, see below chunks):
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/DB_Age.chip_fixed.RData")
#Call Volcano (Nominal p Version, modified from Rachel's code):
source("/home/BCRICWH.LAN/dlin/KoborLab/kobor_space/cake/home/dlin/Volcano_DL_Nominal.R")
#After running the last 2 chunks, make a summary table with CpG, Nominal_P, FDR, and Delta Beta.
Age.chip_Table <- data.frame(rownames(M_values.funnorm.filt), Age.chip_LM_pval, Multi_test_corr_relaxed, delta_beta_Age.chip_fixed)
colnames(Age.chip_Table) = c("CpG", "Nominal_P", "FDR", "Delta_Beta")
identical(as.character(rownames(Age.chip_Table)), as.character(Age.chip_Table$CpG)) #TRUE.## [1] TRUE
#Looking at top hits quickly without considering DB:
head(Age.chip_Table[order(Age.chip_Table$Nominal_P),],10)## CpG Nominal_P FDR Delta_Beta
## cg15155209 cg15155209 5.551609e-07 0.1537017 -0.19917332
## cg25359907 cg25359907 1.027507e-06 0.1537017 0.04889796
## cg06968164 cg06968164 1.297871e-06 0.1537017 -0.16568531
## cg19005438 cg19005438 1.445053e-06 0.1537017 0.15735573
## cg06173889 cg06173889 2.267861e-06 0.1619278 0.23625113
## cg22197050 cg22197050 2.610535e-06 0.1619278 0.06999899
## cg09981830 cg09981830 2.882339e-06 0.1619278 -0.15967392
## cg21279207 cg21279207 3.364030e-06 0.1619278 0.04648695
## cg27239981 cg27239981 3.425384e-06 0.1619278 0.17259425
## cg15861585 cg15861585 4.499119e-06 0.1759085 0.22678014
##Setting a threshold of 0.05DB, 5e-6 Nominal P (scale to 0.60DB):
makeVolcano_nominal(Age.chip_Table$Nominal_P, Age.chip_Table$Delta_Beta, 0.05, 5e-6, "DNAm changes", 0.5) #at 5e-6: 9 Hypermethylated, 0 Hypomethylated## [1] "Hypermethylated: 5"
## [1] "Hypomethylated: 4"
## Warning: Removed 2 rows containing missing values (geom_point).
#What are these hits?
#First make an annotated table - load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
Age.chip_Table.annotated = merge(Age.chip_Table, EPIC_Annotation_Complete[,c("Name", "CHR", "Strand", "UCSC_RefGene_Name", "UCSC_RefGene_Group")], by.x = "CpG", by.y = "Name", all = FALSE)
colnames(Age.chip_Table.annotated)[5:6] = c("Chromosome", "Coordinate")
Age.chip_Table.annotated <- Age.chip_Table.annotated[order(Age.chip_Table.annotated$Nominal_P),]
#Grabbing the Volcano hits:
LM_Age.chip_Hits <- Age.chip_Table.annotated[which(abs(Age.chip_Table.annotated$Delta_Beta)>0.05 & Age.chip_Table.annotated$Nominal_P<5e-6),]
#Let's order by Nominal_P:
LM_Age.chip_Hits = LM_Age.chip_Hits[order(LM_Age.chip_Hits$Nominal_P),]
rownames(LM_Age.chip_Hits) = c()
str(LM_Age.chip_Hits)## 'data.frame': 9 obs. of 8 variables:
## $ CpG : Factor w/ 425456 levels "cg00000029","cg00000108",..: 246988 119392 301975 106386 345056 167079 416764 256738 137398
## $ Nominal_P : num 5.55e-07 1.30e-06 1.45e-06 2.27e-06 2.61e-06 ...
## $ FDR : num 0.154 0.154 0.154 0.162 0.162 ...
## $ Delta_Beta : num -0.199 -0.166 0.157 0.236 0.07 ...
## $ Chromosome : Factor w/ 25 levels "","1","10","11",..: 17 6 8 13 13 21 20 4 7
## $ Coordinate : Factor w/ 3 levels "","F","R": 2 2 2 3 3 3 2 2 2
## $ UCSC_RefGene_Name : Factor w/ 66070 levels "","A1BG","A1BG-AS1;A1BG",..: 53494 1 20278 54913 30522 1 1 15936 21983
## $ UCSC_RefGene_Group: Factor w/ 8044 levels "","1stExon","1stExon;1stExon",..: 4133 1 5441 1209 5441 1 1 5441 5441
head(LM_Age.chip_Hits)## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 1 cg15155209 5.551609e-07 0.1537017 -0.19917332 3 F
## 2 cg06968164 1.297871e-06 0.1537017 -0.16568531 13 F
## 3 cg19005438 1.445053e-06 0.1537017 0.15735573 15 F
## 4 cg06173889 2.267861e-06 0.1619278 0.23625113 2 R
## 5 cg22197050 2.610535e-06 0.1619278 0.06999899 2 R
## 6 cg09981830 2.882339e-06 0.1619278 -0.15967392 7 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 1 SLC6A6;SLC6A6 Body;Body
## 2
## 3 FOXB1 TSS1500
## 4 SOX11;SOX11 3'UTR;1stExon
## 5 LOC100132215 TSS1500
## 6
Age-associated hits: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3482848/ Non-tissue specific hits: https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-018-0191-3
#Reorder row index.
rownames(Age.chip_Table.annotated) <- NULL
#Look at hits based on candidate genes.
head(Age.chip_Table.annotated[which(Age.chip_Table.annotated$UCSC_RefGene_Name == "ELOVL2"),]) #First ELOVL2 hit == row 278.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 490 cg16867657 0.0006480605 0.5222414 0.10186999 6 F
## 5912 cg24724428 0.0102957390 0.6871049 0.09904397 6 F
## 15572 cg21572722 0.0302386080 0.7669489 0.05297685 6 F
## 24589 cg01799681 0.0504981006 0.8099904 -0.06682364 6 F
## 67578 cg16323298 0.1540096275 0.9001507 -0.02915569 6 F
## 118811 cg25151806 0.2820134342 0.9380256 0.01249799 6 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 490 ELOVL2 TSS1500
## 5912 ELOVL2 TSS1500
## 15572 ELOVL2 TSS1500
## 24589 ELOVL2 Body
## 67578 ELOVL2 TSS1500
## 118811 ELOVL2 TSS1500
head(Age.chip_Table.annotated[which(Age.chip_Table.annotated$UCSC_RefGene_Name == "EDARADD"),]) #First EDARADD hit == row 111292.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 48400 cg18964582 0.1069164 0.8722603 0.01701566 1 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 48400 EDARADD TSS1500
head(Age.chip_Table.annotated[which(Age.chip_Table.annotated$UCSC_RefGene_Name == "TOM1L1"),]) #First TOM1L1 hit == row 2140.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 2814 cg07081054 0.00460419 0.6425326 0.063107922 17 R
## 7730 cg25431220 0.01384333 0.7057645 0.017801409 17 F
## 33207 cg05265484 0.07067246 0.8391064 0.070202235 17 F
## 54646 cg10237252 0.12227540 0.8833635 0.101841269 17 R
## 170663 cg03870845 0.41371178 0.9583453 0.008528875 17 F
## 173287 cg13913085 0.42041479 0.9591561 0.044899176 17 F
## UCSC_RefGene_Name UCSC_RefGene_Group
## 2814 TOM1L1 Body
## 7730 TOM1L1 TSS200
## 33207 TOM1L1 TSS1500
## 54646 TOM1L1 TSS1500
## 170663 TOM1L1 TSS200
## 173287 TOM1L1 Body
head(Age.chip_Table.annotated[which(Age.chip_Table.annotated$UCSC_RefGene_Name == "NPTX2"),]) #First NPTX2 hit == row 8828.## CpG Nominal_P FDR Delta_Beta Chromosome Coordinate
## 13049 cg02368096 0.02478601 0.7509686 0.06380722 7 R
## 21865 cg13585675 0.04447912 0.8019882 0.05113651 7 F
## 36700 cg13878520 0.07904831 0.8494727 0.03212259 7 R
## 42812 cg05168977 0.09365409 0.8635618 -0.04768416 7 F
## 48517 cg13695954 0.10719414 0.8723142 0.01446438 7 F
## 52482 cg13314145 0.11695986 0.8797918 0.02459323 7 R
## UCSC_RefGene_Name UCSC_RefGene_Group
## 13049 NPTX2 Body
## 21865 NPTX2 Body
## 36700 NPTX2 Body
## 42812 NPTX2 1stExon
## 48517 NPTX2 Body
## 52482 NPTX2 TSS1500
#Load all the objects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_SVA_M_values.funnorm.RData")
dim(AMD_pData) #44 samples.## [1] 44 22
dim(M_values.funnorm.filt) #425456 probes.## [1] 425456 44
AMD_pData$Disease_State <- as.factor(AMD_pData$Disease_State)
AMD_pData$Sex <- as.factor(AMD_pData$Sex)
AMD_pData$Row <- as.factor(AMD_pData$Row)
AMD_pData$Chip <- as.factor(AMD_pData$Chip)
str(AMD_pData)## 'data.frame': 44 obs. of 22 variables:
## $ Sample_Name : chr "Sample 1" "Sample 10" "Sample 11" "Sample 12" ...
## $ Disease_State: Factor w/ 2 levels "age-related macular degeneration",..: 2 2 2 1 1 1 2 1 1 1 ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 1 2 2 1 2 ...
## $ Age : int 61 74 70 76 79 89 66 70 83 76 ...
## $ Tissue : chr "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" "pigmented layer of retina" ...
## $ Row : Factor w/ 12 levels "R01C01","R01C02",..: 3 10 12 1 3 5 7 9 11 2 ...
## $ Chip : Factor w/ 4 levels "200723300084",..: 2 2 2 4 4 4 4 4 4 4 ...
## $ Basename : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ filenames : chr "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R02C01" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R05C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200723300089_R06C02" "/home/BCRICWH.LAN/kendrix.kek/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/200770460039_R01C01" ...
## $ xMed : num 11.4 11.7 11.9 11.7 11.6 ...
## $ yMed : num 11.7 12 12.1 11.9 11.9 ...
## $ predictedSex : chr "M" "M" "M" "M" ...
## $ SV : num -0.1222 -0.011 -0.1074 -0.073 -0.0265 ...
## $ Epithelial : num 0.387 0.436 0.367 0.413 0.387 ...
## $ Fibroblast : num 0.326 0.316 0.298 0.24 0.312 ...
## $ B_Cell : num 0.0502 0.0454 0.0582 0.0528 0.0482 ...
## $ NK_Cell : num 0.065 0.0601 0.0816 0.0986 0.0743 ...
## $ CD4T : num 0.0478 0.0467 0.0545 0.0658 0.0544 ...
## $ CD8T : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Monocyte : num 0.0685 0.05 0.081 0.0666 0.0651 ...
## $ Neutrophil : num 0 0 0 0 0 ...
## $ Eosinophil : num 0.0555 0.0457 0.0597 0.0632 0.0597 ...
#Check order.
identical(rownames(AMD_pData), colnames(M_values.funnorm.filt)) #TRUE. ## [1] TRUE
#Sanity check - there should be no NAs or infinite numbers - which could be a result of logit transformation of 0 or 1 beta values.
all(complete.cases(M_values.funnorm.filt)) == "TRUE" #TRUE - meaning no NA or infinite numbers. ## [1] TRUE
#Load 450K manifest.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
head(EPIC_Annotation_Complete)## Name AddressA_ID
## cg07881041 cg07881041 0085713262
## cg18478105 cg18478105 0046761277
## cg23229610 cg23229610 0021717843
## cg03513874 cg03513874 0029622133
## cg09835024 cg09835024 0016745152
## cg05451842 cg05451842 0016681196
## AlleleA_ProbeSeq AddressB_ID
## cg07881041 CTACAAATACAACACCCTCAACCCATATTTCATATATTATCTCATTTAAC
## cg18478105 AAATAAATTTCACTCTCAAATCCCAATCTCATACAACAAAACAAAAACCA 0086644198
## cg23229610 ATAAAATTCTTTCCTTAAAAAACATTAAAACCAAAATAAACAAAAATTCC
## cg03513874 ACAATAAAATAATAAAATCCCATCACTACTTACCCTCCTTAAATAATATC
## cg09835024 AATAAACACCAACCCCAAACCAATCTCACTTTATTAAATTACAAAAATCA 0081631976
## cg05451842 CRTTCAAATACACTATAACCCRACTAAAAAAACCCCCAACAACCCAAAAC
## AlleleB_ProbeSeq
## cg07881041
## cg18478105 AAATAAATTTCGCTCTCAAATCCCAATCTCGTACGACGAAACGAAAACCG
## cg23229610
## cg03513874
## cg09835024 AATAAACGCCGACCCCGAACCGATCTCGCTTTATTAAATTACAAAAATCG
## cg05451842
## Infinium_Design_Type Next_Base Color_Channel
## cg07881041 II
## cg18478105 I C Grn
## cg23229610 II
## cg03513874 II
## cg09835024 I A Red
## cg05451842 II
## Forward_Sequence
## cg07881041 CTGCACGCCTACTGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAA[CG]CATGTATCATCTCATTTAATGCATGCATTATCTCATTTAATTCTCACAACCCCTCAGGTG
## cg18478105 TCCCGTCTTACGGGATGGATTTCGCTCTCAGGTCCCAGTCTCGTGCGGCGGGGCGGGGAC[CG]CAGCCGGCTGGGCGGGGAAGCCCTGAGCCGGGGAAGTCACGTGGGGCGTGTCCGGAGGCG
## cg23229610 GTTTCTGGACAGTAAAATTCTTTCCTTGAAGGACATTAGGGCCAAAATGGGCAAGGATTC[CG]AGATTGGTACATCGAGCGTTATCTTCCAACTCTCTTTTCTAAATGGGCTCATTTAGTAAT
## cg03513874 ATTGTGCCCACCTTGCTGCTGACAGTTAAGCATCACTAAAGTAGGAAATAGGGTCCAAAC[CG]ACACTACTTAAGGAGGGCAAGTAGTGATGGGACCTCATCATCCCATTGCTATCATGGAGC
## cg09835024 AGCCCCGTCATAGGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAAT[CG]GTATTCAAAAAAAAAAAAAAAAAAGGGCGGGGAGGACACTCCCTCTTCTCTGTTCCCACA
## cg05451842 CACAGCGTGGATGCCCCGATTTCCCAGGTCCCTCCGCAACCCTCAGTAGAACTCCCACCG[CG]CCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAACGCCCCGGTCCGT
## Genome_Build CHR MAPINFO
## cg07881041 37 19 5236016
## cg18478105 37 20 61847650
## cg23229610 37 1 6841125
## cg03513874 37 2 198303466
## cg09835024 37 X 24072640
## cg05451842 37 14 93581139
## SourceSeq Strand
## cg07881041 TGCAGGTGCAGCACCCTCAGCCCATGTTTCATGTATTATCTCATTTAACG R
## cg18478105 CGGTCCCCGCCCCGCCGCACGAGACTGGGACCTGAGAGCGAAATCCATCC R
## cg23229610 CGGAATCCTTGCCCATTTTGGCCCTAATGTCCTTCAAGGAAAGAATTTTA R
## cg03513874 CAATGGGATGATGAGGTCCCATCACTACTTGCCCTCCTTAAGTAGTGTCG F
## cg09835024 GGTGGGCGCCGACCCCGAGCCGATCTCGCTTTATTAAATTACAGAAATCG R
## cg05451842 CGCCCTGGGCTGCTGGGGGCCTCCCCAGCCGGGTCACAGTGCACCTGAAC F
## UCSC_RefGene_Name UCSC_RefGene_Accession
## cg07881041 PTPRS;PTPRS;PTPRS;PTPRS NM_130855;NM_002850;NM_130854;NM_130853
## cg18478105 YTHDF1 NM_017798
## cg23229610
## cg03513874
## cg09835024 EIF2S3 NM_001415
## cg05451842 ITPK1;ITPK1;ITPK1 NM_001142593;NM_014216;NM_001142594
## UCSC_RefGene_Group UCSC_CpG_Islands_Name
## cg07881041 Body;Body;Body;Body chr19:5237294-5237669
## cg18478105 TSS200 chr20:61846843-61848103
## cg23229610 chr1:6844313-6846366
## cg03513874 chr2:198299244-198299972
## cg09835024 TSS1500 chrX:24072558-24073135
## cg05451842 Body;Body;Body chr14:93581083-93582797
## Relation_to_UCSC_CpG_Island Phantom4_Enhancers Phantom5_Enhancers
## cg07881041 N_Shore
## cg18478105 Island
## cg23229610 N_Shelf
## cg03513874 S_Shelf
## cg09835024 Island
## cg05451842 Island
## DMR X450k_Enhancer HMM_Island Regulatory_Feature_Name
## cg07881041 NA
## cg18478105 NA 20:61317142-61318498 20:61846284-61847956
## cg23229610 NA
## cg03513874 NA
## cg09835024 NA X:24071907-24073667
## cg05451842 NA 14:92650663-92652544
## Regulatory_Feature_Group GencodeBasicV12_NAME
## cg07881041
## cg18478105 Promoter_Associated YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 Promoter_Associated EIF2S3
## cg05451842 ITPK1
## GencodeBasicV12_Accession GencodeBasicV12_Group
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3 TSS200;TSS200
## cg23229610
## cg03513874
## cg09835024 ENST00000253039.4 TSS200
## cg05451842 ENST00000555495.1 5'UTR
## GencodeCompV12_NAME
## cg07881041
## cg18478105 YTHDF1;YTHDF1
## cg23229610
## cg03513874
## cg09835024 EIF2S3;EIF2S3;EIF2S3
## cg05451842 ITPK1
## GencodeCompV12_Accession
## cg07881041
## cg18478105 ENST00000370334.4;ENST00000370339.3
## cg23229610
## cg03513874
## cg09835024 ENST00000487075.1;ENST00000423068.1;ENST00000253039.4
## cg05451842 ENST00000555495.1
## GencodeCompV12_Group DNase_Hypersensitivity_NAME
## cg07881041
## cg18478105 TSS200;TSS200 chr20:61847520-61847755
## cg23229610
## cg03513874
## cg09835024 TSS1500;TSS1500;TSS200 chrX:24072600-24073395
## cg05451842 5'UTR chr14:93581080-93581375
## DNase_Hypersensitivity_Evidence_Count OpenChromatin_NAME
## cg07881041 NA
## cg18478105 3
## cg23229610 NA
## cg03513874 NA
## cg09835024 3
## cg05451842 3
## OpenChromatin_Evidence_Count TFBS_NAME TFBS_Evidence_Count
## cg07881041 NA NA
## cg18478105 NA NA
## cg23229610 NA NA
## cg03513874 NA NA
## cg09835024 NA NA
## cg05451842 NA NA
## Methyl27_Loci Methyl450_Loci Chromosome_36 Coordinate_36
## cg07881041 NA TRUE 19 5187016
## cg18478105 NA TRUE 20 61318095
## cg23229610 NA TRUE 1 6763712
## cg03513874 NA TRUE 2 198011711
## cg09835024 NA TRUE X 23982561
## cg05451842 NA TRUE 14 92650892
## SNP_ID SNP_DISTANCE SNP_MinorAlleleFrequency
## cg07881041 rs187313142 18 0.000200
## cg18478105 rs549944121 5 0.001797
## cg23229610 rs545824288;rs527255711 40;12 0.000200;0.001198
## cg03513874
## cg09835024
## cg05451842 rs550745821 22 0.000200
## Random_Loci X strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs
## cg07881041 NA NA - <NA> NA <NA> NA <NA>
## cg18478105 NA NA - <NA> NA <NA> NA <NA>
## cg23229610 NA NA - <NA> NA <NA> NA <NA>
## cg03513874 NA NA + <NA> NA <NA> NA <NA>
## cg09835024 NA NA - <NA> NA <NA> NA <NA>
## cg05451842 NA NA + <NA> NA <NA> NA <NA>
## SBE_maf CH_450_XY CH_450_Aut CH_EPIC Cross_Hyb
## cg07881041 NA No No No No
## cg18478105 NA No No No No
## cg23229610 NA No No No No
## cg03513874 NA No No No No
## cg09835024 NA No No No No
## cg05451842 NA No No No No
#Pull out ELOVL2-specific CpG sites.
ELOVL2 <- EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$UCSC_RefGene_Name == "ELOVL2"),]
#Subset ELOVL2-specific CpG sites.
M_values.funnorm.filt <- M_values.funnorm.filt[which(rownames(M_values.funnorm.filt) %in% rownames(ELOVL2)),]library(pbapply) #Progress bar for apply functions.## Warning: package 'pbapply' was built under R version 3.6.3
#EWAS on Age - All samples.
#LM: Need to use transformed M-values instead of beta values as it is more statistically sound.
Age.candidate_LM_pval <- pbsapply(1:nrow(M_values.funnorm.filt), function(CpG){
meta <- AMD_pData
meta$Mval <- M_values.funnorm.filt[CpG,]
mod_Age.candidate.row <- lm(Mval ~ Age + SV, data = meta)
coef(summary(mod_Age.candidate.row))[2,4]}) #Returns nominal p-value for Age for model at each CpG.
head(Age.candidate_LM_pval)## [1] 8.842870e-01 8.694883e-01 2.590172e-02 4.282323e-01 1.398169e-01
## [6] 3.463246e-05
#Inspect p-value distribution for model.
pvalue_dist_Age.candidate <- data.frame(CpG = rownames(M_values.funnorm.filt), Nominal_P = Age.candidate_LM_pval)
qqnorm(pvalue_dist_Age.candidate$Nominal_P)#Right skewed Q-Q plot.This is the site with information pertaining to the Hovarth clock input. Note that the site runs all three clocks: Horvath, Hannum and PhenoAge clocks (among other clocks - refer to Horvath’s manual (Location: ~KoborLab/kobor_space/kendrix/macular_degeneration/ManualEpigeneticClock3.pdf)) simultaneously with the same input.
Input: Data frame of betas from noob-normalised MSet object with Sample IDs as columns.
#Horvath data prep function.
horvathPrep <- function(dat){
dat <- as.data.frame(dat)
dat0= cbind(rownames(dat), dat) #need col 1 to be CpG probe ID
datMiniAnnotation=read.csv("~/KoborLab/kobor_space/shared_coding_resource/Horvath_Age_Prediction/datMiniAnnotation3.csv") #get from horvath website above
match1=match(datMiniAnnotation[,1], dat0[,1] )
dat0Reduced=dat0[match1,]
dat0Reduced[,1]=as.character(dat0Reduced[,1])
dat0Reduced[is.na(match1),1]= as.character(datMiniAnnotation[is.na(match1),1])
datout=data.frame(dat0Reduced)
# make sure you output numeric variables...
for (i in 2:dim(datout)[[2]] ){datout[,i]= as.numeric(as.character(gsub(x=datout[,i],pattern="\"",replacement=""))) }
colnames(datout)[1] <- "Probe"
return(datout)}
#Prepare methylation data for Horvath clock.
betas.noob <- getBeta(AMD_MSet.noob)
AMD_betas.Horvath <- horvathPrep(betas.noob)
#Match colnames to AMD_samplesheet.
colnames(AMD_betas.Horvath) <- gsub("\\.", " ", colnames(AMD_betas.Horvath))
identical(colnames(AMD_betas.Horvath)[2:45], sampleNames(AMD_ExtendedRGSet)) #Sample order matches.
write.table(AMD_betas.Horvath,"~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_betas.Horvath.csv", row.names = F, sep = "," )For ‘Advanced Blood Analysis’ (more columns in output including Hannum & Weidner clocks, as well as cell type estimate, etc. you need to create an annotation file to be used with calculating epigenetic age. We need three variables as well as the first column representing the sample names. - Column 1: Sample ID which contains a list of the same ordered names as the methylation file - Column 2: Chronological age as an integer - Column 3: Titled “Female”, male = 0 and female = 1. This is used to validate gender labels - Column 4: “Tissue”, from Horvath’s PDF tutorial.
#Make sample file for Advanced Analysis in Blood.
Sample_Annotation <- pData(AMD_MSet.noob)
head(Sample_Annotation)
Sample_Annotation <- Sample_Annotation[, c("Sample_Name", "Age", "Sex", "Tissue")]
Sample_Annotation$Sex <- gsub("F", "1", gsub("M", "0", Sample_Annotation$Sex))
colnames(Sample_Annotation)[3] <- "Female"
str(Sample_Annotation)identical(rownames(Sample_Annotation), colnames(AMD_betas.Horvath)[2:45]) #TRUE.## [1] TRUE
write.table(Sample_Annotation,"~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Sample_Annotation_Horvath.csv", row.names = F, sep = "," )After the epigenetic ages are calculated and the output is obtained from the Horvath team, upload the output to the server to be visualised.
#Load necessary objeects.
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_ExtendedRGSet.RData")
load("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_MSet.noob.RData")
AMD_samplesheet <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/idats/AMD_project/AMD_samplesheet.csv", header = TRUE)AMD_Epigenetic_Age <- read.csv(file = "~/KoborLab/kobor_space/kendrix/macular_degeneration/data/AMD_betas.output.csv", row.names = 1)
rownames(AMD_Epigenetic_Age) <- AMD_Epigenetic_Age$Sample_Name
#Add Epigenetic Age information to RGSet.
identical(rownames(pData(AMD_ExtendedRGSet)), rownames(AMD_Epigenetic_Age))## [1] TRUE
AMD_pData <- as.data.frame(pData(AMD_ExtendedRGSet))
AMD_pData <- join(AMD_pData, AMD_Epigenetic_Age, by = "Sample_Name")
rownames(AMD_pData) <- AMD_pData$Sample_Name
identical(rownames(pData(AMD_ExtendedRGSet)), rownames(AMD_pData)) #Check to make sure the order of the samples that are merged is correct.## [1] TRUE
The Horvath clock is a pan-tissue epigenetic clock that will predict the DNAm age of tissues based on 353 CpG sites that were developed from 8,000 samples from 82 datasets.
Note that even though the Horvath clock was trained using multiple tissues, none of the tissues are retina tissues that are in this dataset.
Error: +/- 3.6 years.
#Subset relevant epigenetic age variables.
AMD_Epigenetic_Age_Horvath <- AMD_pData[, c("Sample_Name", "Sample_Group", "Array", "Slide", "Sex", "predictedGender", "Tissue", "predictedTissue", "Age", "DNAmAge", "AgeAccelerationDiff", "AgeAccelerationResidual")]
#Create function to plot linear regression.
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point(aes(colour = AMD_Epigenetic_Age_Horvath$Sex, shape = AMD_Epigenetic_Age_Horvath$Sample_Group)) +
guides(size = FALSE) +
stat_smooth(method = "lm", col = "red")
}
#Regression plot of Horvath predicted DNAmAge against reported age:
AMD_HO_plot <- lm(DNAmAge ~ Age, data = AMD_Epigenetic_Age_Horvath)
ggplotRegression(AMD_HO_plot) + labs(x = "Reported age", y = "Horvath predicted age", caption = "Comparing reported age and predicted age using Horvath epigenetic clock") +
scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab"), labels = c("Female", "Male")) +
scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
stat_cor(method = "spearman") +
theme_bw() + theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text = element_text(),
axis.title = element_text(size = 8),
plot.caption = element_text(hjust = 0.5, size = rel(1)))## `geom_smooth()` using formula 'y ~ x'
#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Horvath$Sample_Group <- factor(AMD_Epigenetic_Age_Horvath$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Horvath,
aes(x = Sample_Group, y = AgeAccelerationResidual, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Horvath Age Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")AMD <- subset(AMD_Epigenetic_Age_Horvath, Sample_Group == "age-related macular degeneration")
normal <- subset(AMD_Epigenetic_Age_Horvath, Sample_Group == "normal")
var.test(AMD$AgeAccelerationResidual, normal$AgeAccelerationResidual)##
## F test to compare two variances
##
## data: AMD$AgeAccelerationResidual and normal$AgeAccelerationResidual
## F = 1.5411, num df = 24, denom df = 18, p-value = 0.35
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.6157627 3.6443110
## sample estimates:
## ratio of variances
## 1.541067
#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Horvath$Sex <- factor(AMD_Epigenetic_Age_Horvath$Sex)
ggplot(data = AMD_Epigenetic_Age_Horvath,
aes(x = Sample_Group, y = AgeAccelerationResidual, fill = Sex)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sex), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Horvath Age Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("AMD", "Normal")) +
scale_fill_manual(values = c("#990000", "#2c7dab")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"))#Calculate mean and difference for Bland-Altman plot.
AMD_Epigenetic_Age_Horvath$Mean <- (AMD_Epigenetic_Age_Horvath$DNAmAge + AMD_Epigenetic_Age_Horvath$Age)/2
AMD_Epigenetic_Age_Horvath$Difference <- AMD_Epigenetic_Age_Horvath$DNAmAge - AMD_Epigenetic_Age_Horvath$Age
ggplot(AMD_Epigenetic_Age_Horvath, aes(Mean, Difference)) + geom_point(shape = 0, size = 3) + ylim(-60, 25) + xlab("Mean (years)") + ylab("Difference (years)") + geom_hline(yintercept = 14.76577) + geom_hline(yintercept = -14.76577) + theme_classic()#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(DNAmAge ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath))##
## Call:
## lm(formula = DNAmAge ~ Sample_Group + Sex + Sex * Sample_Group,
## data = AMD_Epigenetic_Age_Horvath)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.2281 -3.4734 0.5236 4.7642 16.6614
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.0360 2.3430 14.100 <2e-16 ***
## Sample_Groupnormal 0.6401 3.6514 0.175 0.862
## SexM 1.0628 3.0248 0.351 0.727
## Sample_Groupnormal:SexM -3.1902 4.6440 -0.687 0.496
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.409 on 40 degrees of freedom
## Multiple R-squared: 0.02061, Adjusted R-squared: -0.05285
## F-statistic: 0.2805 on 3 and 40 DF, p-value: 0.8391
summary(aov(DNAmAge ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 19.4 19.42 0.354 0.555
## Sex 1 0.9 0.88 0.016 0.900
## Sample_Group:Sex 1 25.9 25.91 0.472 0.496
## Residuals 40 2195.9 54.90
summary(lm(AgeAccelerationResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath))##
## Call:
## lm(formula = AgeAccelerationResidual ~ Sample_Group + Sex + Sex *
## Sample_Group, data = AMD_Epigenetic_Age_Horvath)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.9235 -3.3887 -0.6178 4.1704 13.0694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.6176 1.9490 -0.830 0.411
## Sample_Groupnormal 0.3858 3.0374 0.127 0.900
## SexM 3.6446 2.5162 1.448 0.155
## Sample_Groupnormal:SexM -2.8798 3.8631 -0.745 0.460
##
## Residual standard error: 6.163 on 40 degrees of freedom
## Multiple R-squared: 0.06235, Adjusted R-squared: -0.007979
## F-statistic: 0.8865 on 3 and 40 DF, p-value: 0.4563
summary(aov(AgeAccelerationResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 18.7 18.75 0.494 0.486
## Sex 1 61.2 61.17 1.610 0.212
## Sample_Group:Sex 1 21.1 21.11 0.556 0.460
## Residuals 40 1519.5 37.99
#Determine the statistical significance male AMD vs. normal.
Horvath_males <- subset(AMD_Epigenetic_Age_Horvath, Sex == "M")
#Preliminary checks before performing t-test.
#Test for normality:
#The Shapiro-Wilk test to check if the data of interest is normality distribution.
#Null hypothesis = The data are normally distribution.
with(Horvath_males, shapiro.test(AgeAccelerationResidual[Sample_Group == "age-related macular degeneration"])) #p-value = 0.2265 which is greater than 0.05, implying the distribution of the data is not significantly different from normal distribution. In other words, assume normality.##
## Shapiro-Wilk normality test
##
## data: AgeAccelerationResidual[Sample_Group == "age-related macular degeneration"]
## W = 0.92462, p-value = 0.2265
with(Horvath_males, shapiro.test(AgeAccelerationResidual[Sample_Group == "normal"])) #p-value = 0.182 which is greater than 0.05, implying the distribution of the data is not significantly different from normal distribution. In other words, assume normality.##
## Shapiro-Wilk normality test
##
## data: AgeAccelerationResidual[Sample_Group == "normal"]
## W = 0.90463, p-value = 0.182
#Test for distribution of variances.
#Use F-test to test for homogeneity of variances.
var.test(AgeAccelerationResidual ~ Sample_Group, data = Horvath_males) #p-value = 0.6034, which is greater than 0.05, implying no significant difference between the variances of both sets of data. ##
## F test to compare two variances
##
## data: AgeAccelerationResidual by Sample_Group
## F = 1.3745, num df = 14, denom df = 11, p-value = 0.6034
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.4092129 4.2534156
## sample estimates:
## ratio of variances
## 1.374468
#Since both sets of data show normality and equality between variances, we can do classic t-test that requires those assumptions of normality and variance.
t.test(AgeAccelerationResidual ~ Sample_Group, data = Horvath_males, var.equal = TRUE)##
## Two Sample t-test
##
## data: AgeAccelerationResidual by Sample_Group
## t = 1.0394, df = 25, p-value = 0.3086
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.447938 7.435825
## sample estimates:
## mean in group age-related macular degeneration
## 2.0269082
## mean in group normal
## -0.4670351
#p-value = 0.3086, which is greater than 0.05. Therefore, the age-acceleration residual in AMD male is not statistically more signficant as compared to normal male.library(effects)## Loading required package: carData
## Warning: package 'carData' was built under R version 3.6.3
## Use the command
## lattice::trellis.par.set(effectsTheme())
## to customize lattice options for effects plots.
## See ?effectsTheme for details.
LM_Horvath_Residual <- lm(AgeAccelerationResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath)
#Look at the interaction effects.
Disease_Sex_Interact <- as.data.frame(effect("Sample_Group*Sex", LM_Horvath_Residual, se = TRUE))
#Interaction plot.
ggplot(Disease_Sex_Interact, aes(x = Sample_Group, y = fit, group = Sex)) +
geom_line(size = 2, aes(color = Sex)) +
geom_ribbon(aes(ymin = fit-se, ymax = fit+se, fill = Sex),alpha = .2) +
scale_color_manual(values = c("#990000", "#2c7dab")) +
ylab("Horvath Age Acceleration Residual") + xlab("Disease State") +
theme_bw() +
theme(text = element_text(size = 12),
legend.text = element_text(size = 12),
legend.direction = "horizontal",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "top")DNAm age acceleration residual data is winsorized (i.e. by shrinking outlying observations to the border of the main part of the data) before plotting.
#Windsorize age acceleration data.
AMD_Epigenetic_Age_Horvath <- AMD_Epigenetic_Age_Horvath %>% mutate(Winz_AgeAccelResidual = winsorize(AgeAccelerationResidual))#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Horvath$Sample_Group <- factor(AMD_Epigenetic_Age_Horvath$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Horvath,
aes(x = Sample_Group, y = Winz_AgeAccelResidual, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Winsorized Horvath Age Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Horvath$Sex <- factor(AMD_Epigenetic_Age_Horvath$Sex)
ggplot(data = AMD_Epigenetic_Age_Horvath,
aes(x = Sample_Group, y = Winz_AgeAccelResidual, fill = Sex)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sex), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Winsorized Horvath Age Accel Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("AMD", "Normal")) +
scale_fill_manual(values = c("#990000", "#2c7dab")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"))#Determine statistical significance between disease state vs control that is stratified by sex with ANOVA.
#Compute the analysis of variance
summary(lm(Winz_AgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath))##
## Call:
## lm(formula = Winz_AgeAccelResidual ~ Sample_Group + Sex + Sex *
## Sample_Group, data = AMD_Epigenetic_Age_Horvath)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.1470 -3.2248 -0.4513 3.9223 9.0518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.5582 1.7825 -0.874 0.387
## Sample_Groupnormal 0.3264 2.7778 0.117 0.907
## SexM 3.3468 2.3012 1.454 0.154
## Sample_Groupnormal:SexM -2.2711 3.5330 -0.643 0.524
##
## Residual standard error: 5.637 on 40 degrees of freedom
## Multiple R-squared: 0.06142, Adjusted R-squared: -0.008974
## F-statistic: 0.8725 on 3 and 40 DF, p-value: 0.4634
summary(aov(Winz_AgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Horvath))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 10.8 10.85 0.341 0.562
## Sex 1 59.2 59.19 1.863 0.180
## Sample_Group:Sex 1 13.1 13.13 0.413 0.524
## Residuals 40 1270.9 31.77
Built from the whole blood of 656 human individuals, aged 19 to 101, this clock, developed by Hannum et al. looks at 71 CpG sites that are highly predictive of age.
#Subset relevant epigenetic age variables.
AMD_Epigenetic_Age_Hannum <- AMD_pData[, c("Sample_Name", "Sample_Group", "Array", "Slide", "Sex", "predictedGender", "Tissue", "predictedTissue", "Age", "DNAmAgeHannum", "AgeAccelerationResidualHannum")]
#Create function to plot linear regression.
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point(aes(colour = AMD_Epigenetic_Age_Hannum$Sex, shape = AMD_Epigenetic_Age_Hannum$Sample_Group)) +
stat_smooth(method = "lm", col = "red")
}
#Regression plot of Hannum predicted DNAmAge against reported age:
AMD_HA_plot <- lm(DNAmAgeHannum ~ Age, data = AMD_Epigenetic_Age_Hannum)
ggplotRegression(AMD_HA_plot) + labs(x = "Reported age", y = "Hannum predicted age", caption = "Comparing reported age and predicted age using Hannum epigenetic clock") +
scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
stat_cor(method = "spearman") +
theme_bw() + theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text = element_text(),
axis.title = element_text(size = 8),
plot.caption = element_text(hjust = 0.5, size = rel(1)))## `geom_smooth()` using formula 'y ~ x'
#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Hannum$Sample_Group <- factor(AMD_Epigenetic_Age_Hannum$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Hannum,
aes(x = Sample_Group, y = AgeAccelerationResidualHannum, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Hannum Age Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Hannum$Sex <- factor(AMD_Epigenetic_Age_Hannum$Sex)
ggplot(data = AMD_Epigenetic_Age_Hannum,
aes(x = Sample_Group, y = AgeAccelerationResidualHannum, fill = Sex)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Hannum Age Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("AMD", "Normal")) +
scale_fill_manual(values = c("#990000", "#2c7dab")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"))#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(AgeAccelerationResidualHannum ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Hannum))##
## Call:
## lm(formula = AgeAccelerationResidualHannum ~ Sample_Group + Sex +
## Sex * Sample_Group, data = AMD_Epigenetic_Age_Hannum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.807 -3.666 0.238 3.789 12.035
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.9414 1.8490 -1.591 0.1195
## Sample_Groupnormal -0.5568 2.8815 -0.193 0.8478
## SexM 5.8651 2.3871 2.457 0.0184 *
## Sample_Groupnormal:SexM -1.5297 3.6649 -0.417 0.6786
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.847 on 40 degrees of freedom
## Multiple R-squared: 0.1842, Adjusted R-squared: 0.123
## F-statistic: 3.011 on 3 and 40 DF, p-value: 0.04125
summary(aov(AgeAccelerationResidualHannum ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Hannum))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 19.3 19.32 0.565 0.45662
## Sex 1 283.5 283.54 8.293 0.00636 **
## Sample_Group:Sex 1 6.0 5.96 0.174 0.67862
## Residuals 40 1367.6 34.19
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
DNAm age acceleration residual data is winsorized (i.e. by shrinking outlying observations to the border of the main part of the data) before plotting.
#Windsorize age acceleration data.
AMD_Epigenetic_Age_Hannum <- AMD_Epigenetic_Age_Hannum %>% mutate(Winz_AgeAccelResidualHannum = winsorize(AgeAccelerationResidualHannum))#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Hannum$Sample_Group <- factor(AMD_Epigenetic_Age_Hannum$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Hannum,
aes(x = Sample_Group, y = Winz_AgeAccelResidualHannum, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Winsorized Hannum Age Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Hannum$Sex <- factor(AMD_Epigenetic_Age_Hannum$Sex)
ggplot(data = AMD_Epigenetic_Age_Hannum,
aes(x = Sample_Group, y = Winz_AgeAccelResidualHannum, fill = Sex)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Winsorized Hannum Age Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("AMD", "Normal")) +
scale_fill_manual(values = c("#990000", "#2c7dab")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"))#Determine statistical significance between disease state vs control that is stratified by sex with ANOVA.
#Compute the analysis of variance
summary(lm(Winz_AgeAccelResidualHannum ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Hannum))##
## Call:
## lm(formula = Winz_AgeAccelResidualHannum ~ Sample_Group + Sex +
## Sex * Sample_Group, data = AMD_Epigenetic_Age_Hannum)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.6332 -3.6662 0.3387 3.7893 11.6195
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.5259 1.7583 -1.437 0.1586
## Sample_Groupnormal -0.9723 2.7401 -0.355 0.7246
## SexM 5.2760 2.2699 2.324 0.0253 *
## Sample_Groupnormal:SexM -0.9407 3.4850 -0.270 0.7886
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.56 on 40 degrees of freedom
## Multiple R-squared: 0.1799, Adjusted R-squared: 0.1184
## F-statistic: 2.925 on 3 and 40 DF, p-value: 0.04541
summary(aov(Winz_AgeAccelResidualHannum ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Hannum))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 21.2 21.15 0.684 0.41305
## Sex 1 247.9 247.86 8.017 0.00722 **
## Sample_Group:Sex 1 2.3 2.25 0.073 0.78862
## Residuals 40 1236.6 30.92
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Developed from 513 CpG sites, this clock was built by Levine et al. to serve as prediction of a surrogate measure of “phenotypic age” that, in and of itself, differentiates morbidity and mortality risk among same-age individuals (optimized to predict a multi-system proxy of physiological dysregulation (phenotypic age)). The clock shares 41 CpGs in the Horvath clock and 5 CpGs in the Hannum clock (which was also present in the Horvath clock).
#Subset relevant epigenetic age variables.
AMD_Epigenetic_PhenoAge <- AMD_pData[, c("Sample_Name", "Sample_Group", "Array", "Slide", "Sex", "predictedGender", "Tissue", "predictedTissue", "Age", "DNAmPhenoAge", "AgeAccelPheno")]
#Create function to plot linear regression.
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point(aes(colour = AMD_Epigenetic_PhenoAge$Sex, shape = AMD_Epigenetic_PhenoAge$Sample_Group)) +
stat_smooth(method = "lm", col = "red")
}
#Regression plot of PhenoAge predicted DNAmAge against reported age:
AMD_PhenoAge_plot <- lm(DNAmPhenoAge ~ Age, data = AMD_Epigenetic_PhenoAge)
ggplotRegression(AMD_PhenoAge_plot) + labs(x = "Reported age", y = "PhenoAge predicted age", caption = "Comparing reported age and predicted age using PhenoAge epigenetic clock") +
scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
stat_cor(method = "spearman") +
theme_bw() + theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text = element_text(),
axis.title = element_text(size = 8),
plot.caption = element_text(hjust = 0.5, size = rel(1)))## `geom_smooth()` using formula 'y ~ x'
#Plot of overall disease state vs control:
AMD_Epigenetic_PhenoAge$Sample_Group <- factor(AMD_Epigenetic_PhenoAge$Sample_Group)
ggplot(data = AMD_Epigenetic_PhenoAge,
aes(x = Sample_Group, y = AgeAccelPheno, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("PhenoAge Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_PhenoAge$Sex <- factor(AMD_Epigenetic_PhenoAge$Sex)
ggplot(data = AMD_Epigenetic_PhenoAge,
aes(x = Sample_Group, y = AgeAccelPheno, fill = Sex)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("PhenoAge Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("AMD", "Normal")) +
scale_fill_manual(values = c("#990000", "#2c7dab")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"))#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(AgeAccelPheno ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_PhenoAge))##
## Call:
## lm(formula = AgeAccelPheno ~ Sample_Group + Sex + Sex * Sample_Group,
## data = AMD_Epigenetic_PhenoAge)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.161 -7.392 4.113 8.431 22.159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.3813 4.6761 -1.365 0.1800
## Sample_Groupnormal -2.6576 7.2871 -0.365 0.7173
## SexM 11.9156 6.0368 1.974 0.0553 .
## Sample_Groupnormal:SexM 0.7959 9.2683 0.086 0.9320
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.79 on 40 degrees of freedom
## Multiple R-squared: 0.1547, Adjusted R-squared: 0.09128
## F-statistic: 2.44 on 3 and 40 DF, p-value: 0.07843
summary(aov(AgeAccelPheno ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_PhenoAge))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 34 34.2 0.156 0.6948
## Sex 1 1565 1564.6 7.156 0.0108 *
## Sample_Group:Sex 1 2 1.6 0.007 0.9320
## Residuals 40 8746 218.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Skin & Blood clock (based on 391 CpGs) was developed to accurately measure the age of human fibroblasts, keratinocytes, buccal cells, endothelial cells, skin and blood samples. This clock shares 45 CpGs (out of 71 CpGs) with the blood-based clock from Hannum (2013) and 60 CpGs (out of 353 CpGs) with the pan tissue clock from Horvath (2013). Despite this significant overlap, epigenetic age acceleration of the skin & blood clock exhibits only moderate correlations with corresponding epigenetic age acceleration measures by Horvath (2013) and Hannum (2013) (r=0.5 and r=0.59, p<1.E-110) in the blood samples from the Women’s Health Initiative (BA23 study).
#Subset relevant epigenetic age variables.
AMD_Epigenetic_AgeSkinBlood <- AMD_pData[, c("Sample_Name", "Sample_Group", "Array", "Slide", "Sex", "predictedGender", "Tissue", "predictedTissue", "Age", "DNAmAgeSkinBloodClock")]
AMD_Epigenetic_AgeSkinBlood$AgeAccelSkinBlood <- lm(DNAmAgeSkinBloodClock ~ Age, data = AMD_Epigenetic_AgeSkinBlood)$residuals
#Create function to plot linear regression.
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point(aes(colour = AMD_Epigenetic_AgeSkinBlood$Sex, shape = AMD_Epigenetic_AgeSkinBlood$Sample_Group)) +
stat_smooth(method = "lm", col = "red")
}
#Regression plot of Skin & Blood clock predicted DNAmAge against reported age:
AMD_AgeSkinBlood_plot <- lm(DNAmAgeSkinBloodClock ~ Age, data = AMD_Epigenetic_AgeSkinBlood)
ggplotRegression(AMD_AgeSkinBlood_plot) + labs(x = "Reported age", y = "Skin & Blood predicted age", caption = "Comparing reported age and predicted age using Skin & Blood epigenetic clock") +
scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
stat_cor(method = "spearman") +
theme_bw() + theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text = element_text(),
axis.title = element_text(size = 8),
plot.caption = element_text(hjust = 0.5, size = rel(1)))## `geom_smooth()` using formula 'y ~ x'
#Plot of overall disease state vs control:
AMD_Epigenetic_AgeSkinBlood$Sample_Group <- factor(AMD_Epigenetic_AgeSkinBlood$Sample_Group)
ggplot(data = AMD_Epigenetic_AgeSkinBlood,
aes(x = Sample_Group, y = AgeAccelSkinBlood, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Skin & Blood Age Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_AgeSkinBlood$Sex <- factor(AMD_Epigenetic_AgeSkinBlood$Sex)
ggplot(data = AMD_Epigenetic_AgeSkinBlood,
aes(x = Sample_Group, y = AgeAccelSkinBlood, fill = Sex)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Skin & Blood Age Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("AMD", "Normal")) +
scale_fill_manual(values = c("#990000", "#2c7dab")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"))#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(AgeAccelSkinBlood ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_AgeSkinBlood))##
## Call:
## lm(formula = AgeAccelSkinBlood ~ Sample_Group + Sex + Sex * Sample_Group,
## data = AMD_Epigenetic_AgeSkinBlood)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.8670 -6.9018 0.5594 4.7605 15.4146
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.1484 2.3529 -0.913 0.367
## Sample_Groupnormal -1.2454 3.6668 -0.340 0.736
## SexM 4.6068 3.0376 1.517 0.137
## Sample_Groupnormal:SexM -0.5158 4.6636 -0.111 0.912
##
## Residual standard error: 7.441 on 40 degrees of freedom
## Multiple R-squared: 0.09159, Adjusted R-squared: 0.02346
## F-statistic: 1.344 on 3 and 40 DF, p-value: 0.2736
summary(aov(AgeAccelSkinBlood ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_AgeSkinBlood))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 21.9 21.94 0.396 0.5326
## Sex 1 200.7 200.65 3.624 0.0642 .
## Sample_Group:Sex 1 0.7 0.68 0.012 0.9125
## Residuals 40 2214.5 55.36
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AMD_Epigenetic_Age_Zhang <- AMD_samplesheet[, c("Sample_Name", "Sample_Group", "Tissue", "Sex", "Age")]
AMD_betas.Zhang <- getBeta(AMD_MSet.noob)Adapted from predictor code for Zhang clock.
This script is used to do age prediction based on Illumina 450K DNA methylation data. Coefficients of the predictor are based on 13,566 training samples. Two age predictors based on different methods (Elastic Net and BLUP) are developed based on 514 DNAm sites.
#Function changes missing value to mean value across all individuals for each probe.
addna <- function(methy){
methy[is.na(methy)]<-mean(methy,na.rm=T)
return(methy)
}
#Load beta matrix and check format.
if(nrow(AMD_betas.Zhang) > ncol(AMD_betas.Zhang)){
AMD_betas.Zhang <- t(AMD_betas.Zhang)
}
#Replace any NA with mean value for each probe.
dataNona <- apply(AMD_betas.Zhang, 2, function(x) addna(x))
#Standardise beta values within each individual by removing the mean and dividing by standard deviation for each individual. (Probe * IND).
dataNona.norm <- apply(dataNona, 1, scale)
rownames(dataNona.norm) <- colnames(dataNona)
#Get the ElasticNet predictor coefficient of each probe.
encoef <- read.table("~/KoborLab/kobor_space/kendrix/macular_degeneration/Zhang_Age_Predictor/en.coef",
stringsAsFactor = FALSE, header = TRUE)
en_int <- encoef[1,2]
encoef <- encoef[-1,]
rownames(encoef) <- encoef$probe
#Get the BLUP coefficient of each probe.
blupcoef <- read.table("~/KoborLab/kobor_space/kendrix/macular_degeneration/Zhang_Age_Predictor/blup.coef",
stringsAsFactor = FALSE, header = TRUE)
blup_int <- blupcoef[1,2]
blupcoef <- blupcoef[-1,]
rownames(blupcoef) <- blupcoef$probe
#Get common probes between predictors and data.
encomm <- intersect(rownames(encoef), rownames(dataNona.norm))
blupcomm <- intersect(rownames(blupcoef), rownames(dataNona.norm))
#Check for missing probes.
endiff <- nrow(encoef) - length(encomm)
blupdiff <- nrow(blupcoef) - length(blupcomm)
print(paste0 (endiff, " probe(s) in Elastic Net predictor is(are) not in the data"))## [1] "0 probe(s) in Elastic Net predictor is(are) not in the data"
print(paste0 (blupdiff, " probe(s) in BLUP predictor is(are) not in the data"))## [1] "0 probe(s) in BLUP predictor is(are) not in the data"
print("BLUP can perform better if the number of missing probes is too large!")## [1] "BLUP can perform better if the number of missing probes is too large!"
#Extract common probes and perform age prediction.
encoef <- encoef[encomm,]
blupcoef <- blupcoef[blupcomm,]
ElasticNetPredAge <- encoef$coef%*%dataNona.norm[encomm,] + en_int
BLUPPredAge <- blupcoef$coef%*%dataNona.norm[blupcomm,] + blup_int
#Add predicted results to metadata.
ElasticNetPredAge <- ElasticNetPredAge[, AMD_Epigenetic_Age_Zhang$Sample_Name]
AMD_Epigenetic_Age_Zhang$ElasticNetPredAge <- as.double(ElasticNetPredAge)
AMD_Epigenetic_Age_Zhang$ElasticNetAgeAccelResidual <- lm(ElasticNetPredAge ~ Age,
data = AMD_Epigenetic_Age_Zhang)$residuals
BLUPPredAge <- BLUPPredAge[, AMD_Epigenetic_Age_Zhang$Sample_Name]
AMD_Epigenetic_Age_Zhang$BLUPPredAge <- as.double(BLUPPredAge)
AMD_Epigenetic_Age_Zhang$BLUPAgeAccelResidual <- lm(BLUPPredAge ~ Age,
data = AMD_Epigenetic_Age_Zhang)$residuals#Create function to plot linear regression.
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point(aes(colour = AMD_Epigenetic_Age_Zhang$Sex, shape = AMD_Epigenetic_Age_Zhang$Sample_Group)) +
stat_smooth(method = "lm", col = "red")
}
#Regression plot of Zhang clock predicted DNAmAge against reported age:
AMD_ENZhang_plot <- lm(ElasticNetPredAge ~ Age, data = AMD_Epigenetic_Age_Zhang)
ggplotRegression(AMD_ENZhang_plot) + labs(x = "Reported age", y = "Zhang predicted age", caption = "Comparing reported age and predicted age using Zhang epigenetic clock - ElasticNet method") +
scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
stat_cor(method = "spearman") +
theme_bw() + theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text = element_text(),
axis.title = element_text(size = 8),
plot.caption = element_text(hjust = 0.5, size = rel(1)))## `geom_smooth()` using formula 'y ~ x'
#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Zhang$Sample_Group <- factor(AMD_Epigenetic_Age_Zhang$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Zhang,
aes(x = Sample_Group, y = ElasticNetAgeAccelResidual, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Zhang Acceleration Residual (ElasticNet)") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Zhang$Sex <- factor(AMD_Epigenetic_Age_Zhang$Sex)
ggplot(data = AMD_Epigenetic_Age_Zhang,
aes(x = Sample_Group, y = ElasticNetAgeAccelResidual, fill = Sex)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Zhang Acceleration Residual (ElasticNet)") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("AMD", "Normal")) +
scale_fill_manual(values = c("#990000", "#2c7dab")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"))#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(ElasticNetAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Zhang))##
## Call:
## lm(formula = ElasticNetAgeAccelResidual ~ Sample_Group + Sex +
## Sex * Sample_Group, data = AMD_Epigenetic_Age_Zhang)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.356 -5.762 2.333 4.889 12.997
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.7594 2.4807 -1.919 0.0622 .
## Sample_Groupnormal 0.1734 3.8658 0.045 0.9645
## SexM 8.2229 3.2025 2.568 0.0141 *
## Sample_Groupnormal:SexM -1.3250 4.9168 -0.269 0.7889
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.845 on 40 degrees of freedom
## Multiple R-squared: 0.2006, Adjusted R-squared: 0.1407
## F-statistic: 3.347 on 3 and 40 DF, p-value: 0.02842
summary(aov(ElasticNetAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Zhang))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 1.8 1.8 0.029 0.86654
## Sex 1 611.6 611.6 9.939 0.00306 **
## Sample_Group:Sex 1 4.5 4.5 0.073 0.78894
## Residuals 40 2461.5 61.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Create function to plot linear regression.
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point(aes(colour = AMD_Epigenetic_Age_Zhang$Sex, shape = AMD_Epigenetic_Age_Zhang$Sample_Group)) +
stat_smooth(method = "lm", col = "red")
}
#Regression plot of Zhang clock predicted DNAmAge against reported age:
AMD_BLUPZhang_plot <- lm(BLUPPredAge ~ Age, data = AMD_Epigenetic_Age_Zhang)
ggplotRegression(AMD_BLUPZhang_plot) + labs(x = "Reported age", y = "Zhang predicted age", caption = "Comparing reported age and predicted age using Zhang epigenetic clock - BLUP method") +
scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
stat_cor(method = "spearman") +
theme_bw() + theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text = element_text(),
axis.title = element_text(size = 8),
plot.caption = element_text(hjust = 0.5, size = rel(1)))## `geom_smooth()` using formula 'y ~ x'
#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Zhang$Sample_Group <- factor(AMD_Epigenetic_Age_Zhang$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Zhang,
aes(x = Sample_Group, y = BLUPAgeAccelResidual, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Zhang Acceleration Residual (BLUP)") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Zhang$Sex <- factor(AMD_Epigenetic_Age_Zhang$Sex)
ggplot(data = AMD_Epigenetic_Age_Zhang,
aes(x = Sample_Group, y = BLUPAgeAccelResidual, fill = Sex)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("Zhang Acceleration Residual (BLUP)") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("AMD", "Normal")) +
scale_fill_manual(values = c("#990000", "#2c7dab")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"))#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(BLUPAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Zhang))##
## Call:
## lm(formula = BLUPAgeAccelResidual ~ Sample_Group + Sex + Sex *
## Sample_Group, data = AMD_Epigenetic_Age_Zhang)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.261 -3.967 1.590 3.561 11.936
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.9243 1.8955 -1.543 0.1308
## Sample_Groupnormal -1.1535 2.9539 -0.390 0.6982
## SexM 5.7728 2.4471 2.359 0.0233 *
## Sample_Groupnormal:SexM -0.4401 3.7570 -0.117 0.9073
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.994 on 40 degrees of freedom
## Multiple R-squared: 0.1925, Adjusted R-squared: 0.1319
## F-statistic: 3.178 on 3 and 40 DF, p-value: 0.03426
summary(aov(BLUPAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Zhang))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 16.8 16.8 0.469 0.49747
## Sex 1 325.2 325.2 9.051 0.00453 **
## Sample_Group:Sex 1 0.5 0.5 0.014 0.90734
## Residuals 40 1437.1 35.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This script is used to do age prediction based on Illumina 450K DNA methylation data. Coefficients of the predictor are based on 1,397 cortical tissues (Age range: 1 - 104 years old). Shireby et al. identified 347 DNAm sites that, in combination optimally predict age in the human cortex using elastic net regression.
#Get beta matrix.
AMD_betas.Cortical <- getBeta(AMD_MSet.noob)
#Check if sample name order match in pData and beta matrix.
identical(rownames(AMD_pData), colnames(AMD_betas.Cortical)) #TRUE.## [1] TRUE
#Run DNAmCortical clock.
source("~/KoborLab/kobor_space/kendrix/macular_degeneration/CorticalClock/CorticalClock.r")
setwd("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/")
CorticalClock(betas = AMD_betas.Cortical, pheno = AMD_pData,
dir = "~/KoborLab/kobor_space/kendrix/macular_degeneration/CorticalClock/", IDcol = "Sample_Name",
Agecol = "Age")## [1] "all the probes overlap between your data and the clock probes - no need for imputing missing values"
## Cortical Clock
## Correlation (r) 0.45
## RMSE (years) 43.94
## MAD (years) 43.29
## [1] "Your Cortical DNAm ages are saved in the file 'CorticalPred.csv'"
## [1] "Accuracy statistics comparing DNAm age and chronological age are saved in the file 'accuracy_statistics.csv'"
## [1] "Plot of Cortical DNAm age against age is saved in the file 'CorticalClockplot.pdf'"
#Add predicted values to the cortical pData.
AMD_Epigenetic_Age_Cortical <- AMD_samplesheet[, c("Sample_Name", "Sample_Group", "Tissue", "Sex", "Age")]
DNAmCortical <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/CorticalPred.csv", header = TRUE)
identical(AMD_Epigenetic_Age_Cortical$Sample_Name, DNAmCortical$ID) #TRUE.## [1] TRUE
AMD_Epigenetic_Age_Cortical$DNAmCorticalPredAge <- DNAmCortical$brainpred
AMD_Epigenetic_Age_Cortical$DNAmCorticalAgeAccelResidual <- lm(DNAmCorticalPredAge ~ Age,
data = AMD_Epigenetic_Age_Cortical)$residuals#Create function to plot linear regression.
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point(aes(colour = AMD_Epigenetic_Age_Cortical$Sex, shape = AMD_Epigenetic_Age_Cortical$Sample_Group)) +
stat_smooth(method = "lm", col = "red")
}
#Regression plot of Zhang clock predicted DNAmAge against reported age:
AMD_Cortical_plot <- lm(DNAmCorticalPredAge ~ Age, data = AMD_Epigenetic_Age_Cortical)
ggplotRegression(AMD_Cortical_plot) + labs(x = "Reported age", y = "DNAmCortical predicted age", caption = "Comparing reported age and predicted age using DNAmCortical epigenetic clock") +
scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
stat_cor(method = "spearman") +
theme_bw() + theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text = element_text(),
axis.title = element_text(size = 8),
plot.caption = element_text(hjust = 0.5, size = rel(1)))## `geom_smooth()` using formula 'y ~ x'
#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Cortical$Sample_Group <- factor(AMD_Epigenetic_Age_Cortical$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Cortical,
aes(x = Sample_Group, y = DNAmCorticalAgeAccelResidual, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("DNAmCortical Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Cortical$Sex <- factor(AMD_Epigenetic_Age_Cortical$Sex)
ggplot(data = AMD_Epigenetic_Age_Cortical,
aes(x = Sample_Group, y = DNAmCorticalAgeAccelResidual, fill = Sex)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("DNAmCortical Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("AMD", "Normal")) +
scale_fill_manual(values = c("#990000", "#2c7dab")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"))#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(DNAmCorticalAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Cortical))##
## Call:
## lm(formula = DNAmCorticalAgeAccelResidual ~ Sample_Group + Sex +
## Sex * Sample_Group, data = AMD_Epigenetic_Age_Cortical)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.0785 -4.8258 -0.0874 4.3463 21.7664
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.947 2.675 -0.728 0.471
## Sample_Groupnormal -4.298 4.169 -1.031 0.309
## SexM 5.585 3.454 1.617 0.114
## Sample_Groupnormal:SexM 1.379 5.303 0.260 0.796
##
## Residual standard error: 8.46 on 40 degrees of freedom
## Multiple R-squared: 0.1526, Adjusted R-squared: 0.08907
## F-statistic: 2.401 on 3 and 40 DF, p-value: 0.08191
summary(aov(DNAmCorticalAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Cortical))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 114.1 114.1 1.594 0.2141
## Sex 1 396.7 396.7 5.543 0.0236 *
## Sample_Group:Sex 1 4.8 4.8 0.068 0.7962
## Residuals 40 2863.0 71.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AMD_Epigenetic_Age_PedBE <- AMD_samplesheet[, c("Sample_Name", "Sample_Group", "Tissue", "Sex", "Age")]
AMD_betas.PedBE <- getBeta(AMD_MSet.noob)Sourced and adapted from “~/KoborLab/kobor_space/shared_coding_resource/Ped_Age_Predictor_McEwen/Age_Pred_McEwen_NG.R”.
PedAge_McEwen_NG <- function(dat0){
#call required libraries
require(WGCNA)
require(sqldf)
require(impute)
require(flashClust)
require(dynamicTreeCut)
require(RPMM)
source("~/KoborLab/kobor_space/shared_coding_resource/Horvath_Age_Prediction/NORMALIZATION.R")
datClock <- read.csv("~/KoborLab/kobor_space/shared_coding_resource/Ped_Age_Predictor_McEwen/datcoefInteresting94.csv", stringsAsFactors = F)
# Impute missing data
if(sum(is.na(dat0)) > 0) {dat0=data.frame(t(impute.knn(as.matrix(t(dat0)))$data))}
# How many probes are missing from the clock
print(paste("You have", length(rownames(dat0)[rownames(dat0) %in% datClock$ID[2:nrow(datClock)]]), "out of", length(datClock$ID[2:nrow(datClock)]), "probes needed for the age clock. Missing", length(datClock$ID[2:nrow(datClock)]) - length(rownames(dat0)[rownames(dat0) %in% datClock$ID[2:nrow(datClock)]]), "altogether."))
print(paste("Resulting in", round(100-100*(length(rownames(dat0)[rownames(dat0) %in% datClock$ID[2:nrow(datClock)]])/length(datClock$ID[2:nrow(datClock)])), 0),"% missingness. Data will be imputed using training data."))
# Add probe names as the first column
dat0 <- as.data.frame(dat0)
dat0$ProbeID <- rownames(dat0)
dat0 <- dat0[, c(ncol(dat0), 1:(ncol(dat0)-1))]
nSamples <- dim(dat0)[[2]]-1
nProbes <- dim(dat0)[[1]]
#Normalize data
dat1 <- dat0[,-1]
gold.mean <- as.numeric(apply(dat1, 1, median, na.rm = TRUE))
datMethUsed <- t(dat1)
datMethUsedNormalized <- BMIQcalibration(datM = datMethUsed, goldstandard.beta = gold.mean, plots = FALSE)
datM <- t(datMethUsedNormalized)
datM <- as.data.frame(datM)
datM$ProbeID <- rownames(datM)
datM <- datM[, c(ncol(datM), 1:(ncol(datM)-1))]
datMt <- t(datM[,-1])
colnames(datMt) <- as.character(datM[,1])
anti.trafo <- function(x,adult.age=20) {ifelse(x<0, (1+adult.age)*exp(x)-1, (1+adult.age)*x+adult.age)}
selectCpGsClock <- is.element(dimnames(datMt)[[2]], as.character(datClock[,1][-1]))
datMethClock0 <- data.frame(datMt[,selectCpGsClock])
datMethClock <- data.frame(datMethClock0[as.character(datClock[,1][-1])])
datMethClock$DNAmAgeBuccal <- as.numeric(anti.trafo(datClock[1,2]+as.numeric(as.matrix(datMethClock)%*%as.numeric(datClock[,2][-1]))))
datout <- data.frame(SampleID <- rownames(datMethClock), DNAmAge <- datMethClock$DNAmAgeBuccal)
}dim(AMD_betas.PedBE) #485512 rows.## [1] 485512 44
#Remove rows with more than 50% NA
AMD_betas.PedBE <- AMD_betas.PedBE[which(rowMeans(!is.na(AMD_betas.PedBE)) > 0.5), ]
#Run PedBE clock.
PedBE_DNAmAge <- PedAge_McEwen_NG(AMD_betas.PedBE)## Loading required package: WGCNA
## Warning: package 'WGCNA' was built under R version 3.6.3
## Loading required package: dynamicTreeCut
## Warning: package 'dynamicTreeCut' was built under R version 3.6.3
## Loading required package: fastcluster
## Warning: package 'fastcluster' was built under R version 3.6.3
##
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
##
## hclust
##
##
## Attaching package: 'WGCNA'
## The following object is masked from 'package:IRanges':
##
## cor
## The following object is masked from 'package:S4Vectors':
##
## cor
## The following object is masked from 'package:stats':
##
## cor
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## Loading required package: flashClust
## Warning: package 'flashClust' was built under R version 3.6.3
##
## Attaching package: 'flashClust'
## The following object is masked from 'package:fastcluster':
##
## hclust
## The following object is masked from 'package:stats':
##
## hclust
## Loading required package: RPMM
## Warning: package 'RPMM' was built under R version 3.6.3
## Loading required package: cluster
## [1] "You have 94 out of 94 probes needed for the age clock. Missing 0 altogether."
## [1] "Resulting in 0 % missingness. Data will be imputed using training data."
## [1] "Fitting EM beta mixture to goldstandard probes"
## [1] Inf
## [1] 0.004582211
## [1] 0.002934143
## [1] 0.004397809
## [1] 0.004897824
## [1] "Done"
## ii= 1
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004790258
## [1] 0.002362019
## [1] 0.003154597
## [1] 0.003574
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 2
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004211355
## [1] 0.002087919
## [1] 0.003216463
## [1] 0.003465551
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 3
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004627178
## [1] 0.002259131
## [1] 0.002849442
## [1] 0.003346199
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 4
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004597235
## [1] 0.002196956
## [1] 0.002015107
## [1] 0.002496612
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 5
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.005547865
## [1] 0.002994156
## [1] 0.001823028
## [1] 0.002511824
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 6
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.00471737
## [1] 0.002323473
## [1] 0.00152028
## [1] 0.00206841
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 7
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004640662
## [1] 0.002265562
## [1] 0.002637281
## [1] 0.003028979
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 8
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004751371
## [1] 0.002334002
## [1] 0.001571872
## [1] 0.002123473
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 9
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004536413
## [1] 0.002096663
## [1] 0.001106063
## [1] 0.001357692
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 10
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.00424201
## [1] 0.001385648
## [1] 0.0007296178
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 11
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004159753
## [1] 0.001882673
## [1] 0.001675138
## [1] 0.00212326
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 12
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004269302
## [1] 0.001959428
## [1] 0.00267937
## [1] 0.002906533
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 13
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004772533
## [1] 0.002340347
## [1] 0.002208179
## [1] 0.002715199
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 14
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004664102
## [1] 0.001201669
## [1] 0.0007497667
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 15
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004363746
## [1] 0.002013022
## [1] 0.001216025
## [1] 0.001807108
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 16
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004571076
## [1] 0.002422267
## [1] 0.003608208
## [1] 0.003946688
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 17
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004152306
## [1] 0.001905675
## [1] 0.002089056
## [1] 0.002558118
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 18
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003649308
## [1] 0.001259969
## [1] 0.00104714
## [1] 0.001412601
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 19
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003979101
## [1] 0.001780051
## [1] 0.001298679
## [1] 0.001807178
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 20
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003537082
## [1] 0.00146439
## [1] 0.001389145
## [1] 0.001772284
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 21
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004986264
## [1] 0.001151158
## [1] 0.0008174939
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 22
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004631683
## [1] 0.002214069
## [1] 0.002377576
## [1] 0.002798655
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 23
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003066756
## [1] 0.001126385
## [1] 0.001409774
## [1] 0.001613505
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 24
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004076159
## [1] 0.001866704
## [1] 0.002101919
## [1] 0.002624584
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 25
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003616447
## [1] 0.001452421
## [1] 0.001132568
## [1] 0.001518411
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 26
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004219319
## [1] 0.002010245
## [1] 0.001974118
## [1] 0.002458454
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 27
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004412387
## [1] 0.001564638
## [1] 0.0006919762
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 28
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.002030515
## [1] 0.0009946138
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 29
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004971709
## [1] 0.002497243
## [1] 0.00356292
## [1] 0.004017098
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 30
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004148087
## [1] 0.002074636
## [1] 0.003306652
## [1] 0.003595068
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 31
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004058778
## [1] 0.001821058
## [1] 0.002452984
## [1] 0.00276532
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 32
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003605015
## [1] 0.001404906
## [1] 0.001254988
## [1] 0.001608153
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 33
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004371892
## [1] 0.002025162
## [1] 0.001673002
## [1] 0.002163652
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 34
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.001947918
## [1] 0.0009276869
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 35
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004051784
## [1] 0.00186273
## [1] 0.002712085
## [1] 0.003084828
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 36
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003547857
## [1] 0.00149191
## [1] 0.002332382
## [1] 0.002622667
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 37
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.005424021
## [1] 0.002940651
## [1] 0.00238355
## [1] 0.003131738
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 38
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004487629
## [1] 0.002120229
## [1] 0.002720539
## [1] 0.003134397
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 39
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.005884391
## [1] 0.003196969
## [1] 0.00342038
## [1] 0.004099628
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 40
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004579889
## [1] 0.002172739
## [1] 0.002448184
## [1] 0.002862934
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 41
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004412343
## [1] 0.002079192
## [1] 0.002336572
## [1] 0.002715363
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 42
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.004763955
## [1] 0.002353705
## [1] 0.002791948
## [1] 0.003331998
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 43
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.003559414
## [1] 0.001429145
## [1] 0.001053754
## [1] 0.001427752
## [1] "Done"
## [1] "Start normalising input probes"
## ii= 44
## [1] "Fitting EM beta mixture to input probes"
## [1] Inf
## [1] 0.005014635
## [1] 0.002575554
## [1] 0.002400071
## [1] 0.003002924
## [1] "Done"
## [1] "Start normalising input probes"
AMD_Epigenetic_Age_PedBE <- cbind(AMD_Epigenetic_Age_PedBE, PedBE_DNAmAge$DNAmAge....datMethClock.DNAmAgeBuccal)
colnames(AMD_Epigenetic_Age_PedBE)[6] <- "DNAmAgePedBE"
AMD_Epigenetic_Age_PedBE$DNAmPedBEAgeAccelResidual <- lm(DNAmAgePedBE ~ Age,
data = AMD_Epigenetic_Age_PedBE)$residuals#Create function to plot linear regression.
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point(aes(colour = AMD_Epigenetic_Age_PedBE$Sex, shape = AMD_Epigenetic_Age_PedBE$Sample_Group)) +
stat_smooth(method = "lm", col = "red")
}
#Regression plot of Zhang clock predicted DNAmAge against reported age:
AMD_PedBE_plot <- lm(DNAmAgePedBE ~ Age, data = AMD_Epigenetic_Age_PedBE)
ggplotRegression(AMD_PedBE_plot) + labs(x = "Reported age", y = "DNAmPedBE predicted age", caption = "Comparing reported age and predicted age using PedBE epigenetic clock") +
scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
stat_cor(method = "spearman") +
theme_bw() + theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text = element_text(),
axis.title = element_text(size = 8),
plot.caption = element_text(hjust = 0.5, size = rel(1)))## `geom_smooth()` using formula 'y ~ x'
#Plot of overall disease state vs control:
AMD_Epigenetic_Age_PedBE$Sample_Group <- factor(AMD_Epigenetic_Age_PedBE$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_PedBE,
aes(x = Sample_Group, y = DNAmPedBEAgeAccelResidual, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("DNAmPedBE Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_PedBE$Sex <- factor(AMD_Epigenetic_Age_PedBE$Sex)
ggplot(data = AMD_Epigenetic_Age_PedBE,
aes(x = Sample_Group, y = DNAmPedBEAgeAccelResidual, fill = Sex)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("DNAmPedBE Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("AMD", "Normal")) +
scale_fill_manual(values = c("#990000", "#2c7dab")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"))#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(DNAmPedBEAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_PedBE))##
## Call:
## lm(formula = DNAmPedBEAgeAccelResidual ~ Sample_Group + Sex +
## Sex * Sample_Group, data = AMD_Epigenetic_Age_PedBE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3188 -1.0868 0.2439 0.9448 2.6585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.59339 0.44031 -1.348 0.1854
## Sample_Groupnormal -0.07251 0.68617 -0.106 0.9164
## SexM 1.02608 0.56844 1.805 0.0786 .
## Sample_Groupnormal:SexM -0.01812 0.87272 -0.021 0.9835
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.392 on 40 degrees of freedom
## Multiple R-squared: 0.1226, Adjusted R-squared: 0.05681
## F-statistic: 1.863 on 3 and 40 DF, p-value: 0.1513
summary(aov(DNAmPedBEAgeAccelResidual ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_PedBE))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 0.03 0.029 0.015 0.9038
## Sex 1 10.81 10.808 5.575 0.0232 *
## Sample_Group:Sex 1 0.00 0.001 0.000 0.9835
## Residuals 40 77.55 1.939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Subset relevant epigenetic age variables.
AMD_Epigenetic_Age_Grim <- AMD_pData[, c("Sample_Name", "Sample_Group", "Array", "Slide", "Sex", "predictedGender", "Tissue", "predictedTissue", "Age", "DNAmGrimAge", "AgeAccelGrim")]
#Create function to plot linear regression.
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point(aes(colour = AMD_Epigenetic_Age_Grim$Sex, shape = AMD_Epigenetic_Age_Grim$Sample_Group, size = 1)) +
guides(size = FALSE) +
stat_smooth(method = "lm", col = "red")
}
#Regression plot of Grim predicted DNAmAge against reported age:
AMD_Grim_plot <- lm(DNAmGrimAge ~ Age, data = AMD_Epigenetic_Age_Grim)
ggplotRegression(AMD_Grim_plot) + labs(x = "Reported age", y = "DNAmGrimAge predicted age", caption = "Comparing reported age and predicted age using GrimAge epigenetic clock") +
scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab"), labels = c("Female", "Male")) +
scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
stat_cor(method = "spearman") +
theme_bw() + theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text = element_text(),
axis.title = element_text(size = 8),
plot.caption = element_text(hjust = 0.5, size = rel(1)))## `geom_smooth()` using formula 'y ~ x'
#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Grim$Sample_Group <- factor(AMD_Epigenetic_Age_Grim$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Grim,
aes(x = Sample_Group, y = AgeAccelGrim, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("DNAmGrimAge Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Grim$Sex <- factor(AMD_Epigenetic_Age_Grim$Sex)
ggplot(data = AMD_Epigenetic_Age_Grim,
aes(x = Sample_Group, y = AgeAccelGrim, fill = Sex)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sex), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("DNAmGrimAge Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("AMD", "Normal")) +
scale_fill_manual(values = c("#990000", "#2c7dab")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"))#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(AgeAccelGrim ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Grim))##
## Call:
## lm(formula = AgeAccelGrim ~ Sample_Group + Sex + Sex * Sample_Group,
## data = AMD_Epigenetic_Age_Grim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2295 -1.1402 0.1506 1.6160 4.2953
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.83212 0.67959 -2.696 0.01022 *
## Sample_Groupnormal 0.77921 1.05906 0.736 0.46617
## SexM 2.47190 0.87734 2.817 0.00748 **
## Sample_Groupnormal:SexM -0.07774 1.34698 -0.058 0.95426
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.149 on 40 degrees of freedom
## Multiple R-squared: 0.2721, Adjusted R-squared: 0.2175
## F-statistic: 4.984 on 3 and 40 DF, p-value: 0.004958
summary(aov(AgeAccelGrim ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Grim))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 7.05 7.05 1.527 0.223816
## Sex 1 61.99 61.99 13.422 0.000721 ***
## Sample_Group:Sex 1 0.02 0.02 0.003 0.954264
## Residuals 40 184.73 4.62
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Weidner clock is developed from an initial 82 blood samples, which are then used to generate a multivariate model. Use the beta values at the following three CpG sites ((α) cg02228185; (β) cg25809905, and (γ) a CpG site upstream of cg17861230) to generate age-prediction, based on the equation from the model below.
Predicted age (in years) = 38.0 - 26.4α - 23.7β + 164.7γ
AMD_Epigenetic_Age_Weidner <- AMD_samplesheet[, c("Sample_Name", "Sample_Group", "Tissue", "Sex", "Age")]
rownames(AMD_Epigenetic_Age_Weidner) <- AMD_Epigenetic_Age_Weidner$Sample_Name
AMD_betas.Weidner <- getBeta(AMD_MSet.noob)
#Subset the 3 CpG sites specified by the clock.
Weidner_sites <- AMD_betas.Weidner[rownames(AMD_betas.Weidner) %in% c("cg02228185", "cg25809905", "cg17861230"),]
Weidner_sites <- as.data.frame(t(Weidner_sites))
#Check sample order.
identical(rownames(AMD_Epigenetic_Age_Weidner), rownames(Weidner_sites)) #TRUE.## [1] TRUE
#Calculate Weidner epigenetic age.
AMD_Epigenetic_Age_Weidner$DNAmAgeWeidner <- 38.0 - 26.4*(Weidner_sites$cg02228185) - 23.7*(Weidner_sites$cg25809905) + 164.7*(Weidner_sites$cg17861230)
#Calculate Weidner age acceleration residual.
AMD_Epigenetic_Age_Weidner$AgeAccelResidualWeidner <- lm(DNAmAgeWeidner ~ Age, data = AMD_Epigenetic_Age_Weidner)$residual#Create function to plot linear regression.
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point(aes(colour = AMD_Epigenetic_Age_Weidner$Sex, shape = AMD_Epigenetic_Age_Weidner$Sample_Group)) +
stat_smooth(method = "lm", col = "red")
}
#Regression plot of Zhang clock predicted DNAmAge against reported age:
AMD_Weidner_plot <- lm(DNAmAgeWeidner ~ Age, data = AMD_Epigenetic_Age_Weidner)
ggplotRegression(AMD_Weidner_plot) + labs(x = "Reported age", y = "DNAmWeidner predicted age", caption = "Comparing reported age and predicted age using DNAmWeidner epigenetic clock") +
scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
stat_cor(method = "spearman") +
theme_bw() + theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text = element_text(),
axis.title = element_text(size = 8),
plot.caption = element_text(hjust = 0.5, size = rel(1)))## `geom_smooth()` using formula 'y ~ x'
#Plot of overall disease state vs control:
AMD_Epigenetic_Age_Weidner$Sample_Group <- factor(AMD_Epigenetic_Age_Weidner$Sample_Group)
ggplot(data = AMD_Epigenetic_Age_Weidner,
aes(x = Sample_Group, y = AgeAccelResidualWeidner, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("DNAmWeidner Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")#Plot of disease state vs control that is stratified by sex:
AMD_Epigenetic_Age_Weidner$Sex <- factor(AMD_Epigenetic_Age_Weidner$Sex)
ggplot(data = AMD_Epigenetic_Age_Weidner,
aes(x = Sample_Group, y = AgeAccelResidualWeidner, fill = Sex)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sex), size = 1, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("DNAmWeidner Acceleration Residual") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("AMD", "Normal")) +
scale_fill_manual(values = c("#990000", "#2c7dab")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"))#Determine statistical significance between disease state vs control with ANOVA.
summary(lm(AgeAccelResidualWeidner ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Weidner))##
## Call:
## lm(formula = AgeAccelResidualWeidner ~ Sample_Group + Sex + Sex *
## Sample_Group, data = AMD_Epigenetic_Age_Weidner)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.5349 -8.6345 0.1477 6.4664 26.0141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.816 3.612 -1.057 0.297
## Sample_Groupnormal 1.397 5.628 0.248 0.805
## SexM 7.108 4.663 1.524 0.135
## Sample_Groupnormal:SexM -4.214 7.159 -0.589 0.559
##
## Residual standard error: 11.42 on 40 degrees of freedom
## Multiple R-squared: 0.06317, Adjusted R-squared: -0.007092
## F-statistic: 0.8991 on 3 and 40 DF, p-value: 0.4501
summary(aov(AgeAccelResidualWeidner ~ Sample_Group + Sex + Sex*Sample_Group, data = AMD_Epigenetic_Age_Weidner))## Df Sum Sq Mean Sq F value Pr(>F)
## Sample_Group 1 12 11.67 0.089 0.766
## Sex 1 295 294.96 2.261 0.141
## Sample_Group:Sex 1 45 45.20 0.346 0.559
## Residuals 40 5218 130.44
#Combine all epigenetic clock outputs into a summary table.
AMD_Epigenetic_All <- join_all(list(AMD_Epigenetic_Age_Horvath,
AMD_Epigenetic_Age_Zhang,
AMD_Epigenetic_Age_Hannum,
AMD_Epigenetic_AgeSkinBlood,
AMD_Epigenetic_Age_Cortical,
AMD_Epigenetic_Age_PedBE,
AMD_Epigenetic_Age_Weidner),
by = "Sample_Name", type = "left")
AMD_Epigenetic_All <- AMD_Epigenetic_All[, !duplicated(colnames(AMD_Epigenetic_All))]
#Change column names to reflect outputs of various clocks.
colnames(AMD_Epigenetic_All)[10:26] <- c("HorvathAge", "HorvathAgeAccelDiff", "HorvathAgeAccelResidual",
"ZhangENAge", "ZhangENAgeAccelResidual", "ZhangBLUPAge",
"ZhangBLUPAgeAccelResidual", "HannumAge", "HannumAgeAccelResidual",
"SkinBloodAge", "SkinBloodAgeAccelResidual", "CorticalAge",
"CorticalAgeAccelResidual", "PedBEAge", "PedBEAgeAccelResidual",
"WeidnerAge", "WeidnerAgeAccelResidual")Plot all epigenetic clock outputs.
#Melt wide form data to long form.
AMD_clocks.all <- AMD_Epigenetic_All[, c(1:10, 21, 17, 13, 15, 25, 19)]
AMD_clocks.melt <- melt(AMD_clocks.all)## Warning in melt(AMD_clocks.all): The melt generic in data.table has been
## passed a data.frame and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(AMD_clocks.all). In the next version, this warning will become an
## error.
## Using Sample_Name, Sample_Group, Array, Slide, Sex, predictedGender, Tissue, predictedTissue as id variables
colnames(AMD_clocks.melt)[c(9:10)] <- c("Epigenetic_clocks", "Age")
#Plot outputs together.
ggplot(data = AMD_clocks.melt, aes(x = Epigenetic_clocks, y = Age, fill = Epigenetic_clocks)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
xlab("\nEpigenetic Clocks Outputs") + ylab(element_blank()) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")#Calculate RMSE.
rmse(AMD_clocks.all$Age, AMD_clocks.all$HorvathAge)## [1] 41.21593
rmse(AMD_clocks.all$Age, AMD_clocks.all$HannumAge)## [1] 74.5432
rmse(AMD_clocks.all$Age, AMD_clocks.all$CorticalAge)## [1] 74.32903
rmse(AMD_clocks.all$Age, AMD_clocks.all$ZhangBLUPAge)## [1] 74.27721
rmse(AMD_clocks.all$Age, AMD_clocks.all$ZhangENAge)## [1] 20.60796
rmse(AMD_clocks.all$Age, AMD_clocks.all$SkinBloodAge)## [1] 74.34481
rmse(AMD_clocks.all$Age, AMD_clocks.all$WeidnerAge)## [1] 43.93984
Check correlation between predicted ages of various clocks.
#Create correlation matrix.
AMD_clocks.cor <- AMD_clocks.all[, c(9:16)]
AMD_clocks.cor <- round(cor(AMD_clocks.cor), 2)
#Plot paired matrix.
library(psych)##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following object is masked from 'package:genefilter':
##
## AUC
## The following object is masked from 'package:ROC':
##
## AUC
## The following object is masked from 'package:IRanges':
##
## reflect
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following objects are masked from 'package:scales':
##
## alpha, rescale
pairs.panels(AMD_clocks.cor,
method = "pearson", # correlation method
hist.col = "#00AFBB",
density = FALSE, # show density plots
ellipses = FALSE # show correlation ellipses
)For Horvath’s clock and Cortical Clock, bin the data to see progression of age acceleration.
#Look at age and predicted age distributions.
summary(AMD_Epigenetic_All$Age)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.00 70.00 74.00 73.66 79.00 89.00
summary(AMD_Epigenetic_All$HorvathAge)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.32 28.86 33.66 33.09 36.81 50.76
summary(AMD_Epigenetic_All$CorticalAge)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -13.6134 -3.9075 0.8102 0.0000 4.0725 14.7770
#Quickly visualise age distribution.
hist(AMD_Epigenetic_All$Age, main = "Chronological Age Distribution", xlab = "Chronological Age")#Horvath's clock:
#Bin epigenetic age acceleration difference according to chronological age.
#Define bucket intervals and name them.
tags <- c("Under 60", "60-69", "70-79", "Above 80")
#Place chronological ages into the buckets. Store groups as a new column.
v <- AMD_Epigenetic_All %>% select(Age, HorvathAgeAccelResidual) #pick the variable
vgroup <- as_tibble(v) %>% mutate(tag = case_when(
Age < 60 ~ tags[1],
Age >= 60 & Age < 70 ~ tags[2],
Age >= 70 & Age < 80 ~ tags[3],
Age >= 80 ~ tags[4]
))
summary(vgroup)## Age HorvathAgeAccelResidual tag
## Min. :50.00 Min. :-14.08932 Length:44
## 1st Qu.:70.00 1st Qu.: -3.51174 Class :character
## Median :74.00 Median : -0.07837 Mode :character
## Mean :73.66 Mean : 0.00000
## 3rd Qu.:79.00 3rd Qu.: 3.32197
## Max. :89.00 Max. : 15.09633
#Factorise bins
vgroup$tag <- factor(vgroup$tag, levels = tags, ordered = FALSE)
summary(vgroup$tag)## Under 60 60-69 70-79 Above 80
## 2 8 24 10
#Visualise the binned data.
ggplot(data = vgroup, mapping = aes(x = tag, y = HorvathAgeAccelResidual)) +
geom_jitter(aes(color = 'blue'), alpha = 0.2) +
geom_boxplot(fill = "#BB9D00", color = "black", alpha = 0.3) +
labs(x = "Chronological Age Deciles", y = "Horvath's Clock Age Acceleration") +
guides(color = FALSE) +
theme_minimal() #Cortical clock:
#Bin epigenetic age acceleration difference according to chronological age.
#Calculate cortical clock age acceleration difference.
AMD_Epigenetic_All$CorticalAgeAccelDiff <- AMD_Epigenetic_All$CorticalAge - AMD_Epigenetic_All$Age
#Define bucket intervals and name them.
tags <- c("Under 60", "60-69", "70-79", "Above 80")
#Place chronological ages into the buckets. Store groups as a new column.
v <- AMD_Epigenetic_All %>% select(Age, CorticalAgeAccelResidual) #pick the variable
vgroup <- as_tibble(v) %>% mutate(tag = case_when(
Age < 60 ~ tags[1],
Age >= 60 & Age < 70 ~ tags[2],
Age >= 70 & Age < 80 ~ tags[3],
Age >= 80 ~ tags[4]
))
summary(vgroup)## Age CorticalAgeAccelResidual tag
## Min. :50.00 Min. :-10.55257 Length:44
## 1st Qu.:70.00 1st Qu.: -3.90748 Class :character
## Median :74.00 Median : 0.81019 Mode :character
## Mean :73.66 Mean : 0.03524
## 3rd Qu.:79.00 3rd Qu.: 4.07247
## Max. :89.00 Max. : 12.17295
#Factorise bins
vgroup$tag <- factor(vgroup$tag, levels = tags, ordered = FALSE)
summary(vgroup$tag)## Under 60 60-69 70-79 Above 80
## 2 8 24 10
#Visualise the binned data.
ggplot(data = vgroup, mapping = aes(x = tag, y = CorticalAgeAccelResidual)) +
geom_jitter(aes(color = 'blue'), alpha = 0.2) +
geom_boxplot(fill = "#00B81F", color = "black", alpha = 0.3) +
labs(x = "Chronological Age Deciles", y = "Cortical Clock Age Acceleration") +
guides(color = FALSE) +
theme_minimal()library(VennDiagram)
library(grid)
#Load epigenetic clock CpGs.
Epigenetic_clocks_CpGs <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Epigenetic_clocks_CpGs.csv",
header = TRUE)
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs==""] <- NA
DNAmHorvath <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmHorvath))
DNAmHannum <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmHannum))
DNAmCortical <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmCortical))
DNAmSkinBlood <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmSkinBlood))
DNAmZhang <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmZhang))
DNAmWeidner <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmWeidner))
#Check similarity between DNAmWeidner CpGs with the other clocks.
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmHorvath %in% DNAmWeidner,]$DNAmHorvath
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmHannum %in% DNAmWeidner,]$DNAmHannum
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmCortical %in% DNAmWeidner,]$DNAmCortical
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmSkinBlood %in% DNAmWeidner,]$DNAmSkinBlood
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmZhang %in% DNAmWeidner,]$DNAmZhang
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmPhenoAge %in% DNAmWeidner,]$DNAmPhenoAge
#Make Venn diagram.
x <- list(DNAmHorvath, DNAmHannum, DNAmZhang, DNAmSkinBlood, DNAmCortical)
setwd("~/KoborLab/kobor_space/kendrix/macular_degeneration/plots/")
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")
venn_cpg <- venn.diagram(x = x,
category.names = c("DNAmHorvath" , "DNAmHannum", "DNAmZhang", "DNAmSkinBlood", "DNAmCortical"),
filename = NULL,
cat.cex = 0.8, cat.fontface = "bold", cat.default.pos = "inner",
cat.pos = c(10, 25, 55, 185, 190),
cat.dist = c(0.08, 0.08, 0.08, 0.08, 0.08),
scale = FALSE
)
grid.newpage()
grid.draw(venn_cpg)Find overlap between age-associated CpGs of Horvath’s and Cortical clocks.
library(VennDiagram)## Loading required package: futile.logger
## Warning: package 'futile.logger' was built under R version 3.6.3
##
## Attaching package: 'futile.logger'
## The following object is masked from 'package:mgcv':
##
## scat
##
## Attaching package: 'VennDiagram'
## The following object is masked from 'package:ggpubr':
##
## rotate
## The following object is masked from 'package:ape':
##
## rotate
## The following object is masked from 'package:dendextend':
##
## rotate
library(grid)
#Load epigenetic clock CpGs.
Epigenetic_clocks_CpGs <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/Epigenetic_clocks_CpGs.csv",
header = TRUE)
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs==""] <- NA
DNAmHorvath <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmHorvath))
DNAmCortical <- na.omit(as.character(Epigenetic_clocks_CpGs$DNAmCortical))
#Make Venn diagram.
x <- list(DNAmHorvath, DNAmCortical)
setwd("~/KoborLab/kobor_space/kendrix/macular_degeneration/plots/")
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")## NULL
venn_overlap <- venn.diagram(x = x,
category.names = c("DNAmHorvath" , "DNAmCortical"),
filename = NULL,
cat.cex = 1, cat.fontface = "bold", cat.default.pos = "outer",
cat.pos = c(-25, 25),
cat.dist = c(0.04, 0.04),
scaled = FALSE
)
grid.newpage()
grid.draw(venn_overlap) #5 overlapping CpGs. #Find out what the overlapping CpGs are.
overlap_cpgs <- intersect(DNAmHorvath, DNAmCortical)
#Load 450K annotation data.
load("~/KoborLab/kobor_space/shared_coding_resource/Illumina_EPIC/EPIC_Annotation_Complete.RData")
EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name %in% overlap_cpgs),]## Name AddressA_ID
## cg21801378 cg21801378 0035637875
## cg23124451 cg23124451 0092806941
## cg19724470 cg19724470 0086696312
## cg08370996 cg08370996 0076666492
## cg06144905 cg06144905 0084705969
## AlleleA_ProbeSeq AddressB_ID
## cg21801378 TAATAACATCATAATCCTTCATAAATACAACAAAACCAAAATTTAACCCA 0091713156
## cg23124451 AAAATACTAATAACRACCRAACCTACCTAAACTAAAAATTAAAATCRAAC
## cg19724470 ACTTACACAAATCATATTACTAACAAACAACTAACRATAAACTTAAACCC
## cg08370996 CCCCCTTTTTAACATATTTAATCACTTTAATTCTCTATTCTTTTCTCTCC
## cg06144905 TAATTTTCCCCTCTTCAACCTAATTTACTTTAAAAATTCTAAAAAACCCC
## AlleleB_ProbeSeq
## cg21801378 TAATAACGTCGTAATCCTTCATAAATACGACGAAACCGAAATTTAACCCG
## cg23124451
## cg19724470
## cg08370996
## cg06144905
## Infinium_Design_Type Next_Base Color_Channel
## cg21801378 I C Grn
## cg23124451 II
## cg19724470 II
## cg08370996 II
## cg06144905 II
## Forward_Sequence
## cg21801378 CCACGAAGAGCTTGATGGCGTCGTGGTCCTTCATGGGTACGGCGGGACCGGGGTTTAGCC[CG]CTCATGCCGACGCCGCTGTCCGCGGTGCTGAAACCCAGGCGCGGGCCGGGGCCAGCGGGC
## cg23124451 TCAGTCTCCCCATATTTACAATAAAAGGGGAGCGAGGTGGGATGGCGCTGAGGATCCCTA[CG]TCCGATCCTAATCTCCAGCTCAGGCAGGCTCGGCCGCCACTAGCATCCTGGAGCGACAAC
## cg19724470 CATTCTTATGCGACTGTGTGTTCAGAATATAGCTCTGATGCTAGGCTGGAGGTCTGGACA[CG]GGTCCAAGTCCACCGCCAGCTGCTTGCTAGTAACATGACTTGTGTAAGTTATCCCAGCTG
## cg08370996 CCCTCCCGCGCCCCCCTTTTTAGCATATTTGATCACTTTGATTCTCTGTTCTTTTCTCTC[CG]CGGTGTGTGTGTGCGTGCGCGCGTGTGTGTTTTCTTCTTCTCCTCCTCCTCTCCCCGAGT
## cg06144905 CTGACCTCACCACCCACCAGGGAGGTGGGTCTTATTCTGGGCATCGTGCCAAGTTCTTAG[CG]GGGCCCTCTAGAATCTCTAAAGCAAATCAGGCTGAAGAGGGGAAAACCAGCAGGGGGAGG
## Genome_Build CHR MAPINFO
## cg21801378 37 15 72612125
## cg23124451 37 22 39548131
## cg19724470 37 9 5450936
## cg08370996 37 15 96874031
## cg06144905 37 17 27369780
## SourceSeq Strand
## cg21801378 CGGGCTAAACCCCGGTCCCGCCGTACCCATGAAGGACCACGACGCCATCA R
## cg23124451 CGTCCGATCCTAATCTCCAGCTCAGGCAGGCTCGGCCGCCACTAGCATCC F
## cg19724470 CGGGTCCAAGTCCACCGCCAGCTGCTTGCTAGTAACATGACTTGTGTAAG F
## cg08370996 CCCCTTTTTAGCATATTTGATCACTTTGATTCTCTGTTCTTTTCTCTCCG R
## cg06144905 CGGGGCCCTCTAGAATCTCTAAAGCAAATCAGGCTGAAGAGGGGAAAACC F
## UCSC_RefGene_Name UCSC_RefGene_Accession UCSC_RefGene_Group
## cg21801378 BRUNOL6 NM_052840 1stExon
## cg23124451 CBX7 NM_175709 Body
## cg19724470 CD274 NM_014143 5'UTR
## cg08370996 NR2F2;NR2F2 NM_021005;NM_001145155 TSS200;Body
## cg06144905 PIPOX NM_016518 TSS200
## UCSC_CpG_Islands_Name Relation_to_UCSC_CpG_Island
## cg21801378 chr15:72611946-72612802 Island
## cg23124451 chr22:39548102-39548432 Island
## cg19724470 chr9:5450409-5450629 S_Shore
## cg08370996 chr15:96873408-96877721 Island
## cg06144905
## Phantom4_Enhancers Phantom5_Enhancers DMR X450k_Enhancer
## cg21801378 NA
## cg23124451 NA
## cg19724470 NA
## cg08370996 high-CpG:94674791-94675270 NA
## cg06144905 NA
## HMM_Island Regulatory_Feature_Name
## cg21801378 15:70399042-70400040 15:72611781-72613209
## cg23124451 22:37877959-37879580
## cg19724470 9:5449680-5451039
## cg08370996 15:94674413-94676109 15:96873561-96874038
## cg06144905
## Regulatory_Feature_Group
## cg21801378 Promoter_Associated
## cg23124451
## cg19724470 Promoter_Associated
## cg08370996 NonGene_Associated_Cell_type_specific
## cg06144905
## GencodeBasicV12_NAME
## cg21801378 CELF6;CELF6;CELF6;CELF6;RP11-106M3.3;RP11-106M3.2
## cg23124451
## cg19724470 CD274;CD274
## cg08370996 NR2F2;NR2F2
## cg06144905 PIPOX
## GencodeBasicV12_Accession
## cg21801378 ENST00000539635.1;ENST00000567083.1;ENST00000287202.5;ENST00000539635.1;ENST00000570175.1;ENST00000379915.4
## cg23124451
## cg19724470 ENST00000381573.4;ENST00000381577.3
## cg08370996 ENST00000394166.3;ENST00000394166.3
## cg06144905 ENST00000323372.4
## GencodeBasicV12_Group
## cg21801378 1stExon;1stExon;1stExon;5'UTR;5'UTR;3'UTR
## cg23124451
## cg19724470 5'UTR;5'UTR
## cg08370996 5'UTR;1stExon
## cg06144905 TSS200
## GencodeCompV12_NAME
## cg21801378 CELF6;CELF6;CELF6;CELF6;CELF6;RP11-106M3.3;RP11-106M3.2
## cg23124451 CBX7;CBX7
## cg19724470 CD274;CD274
## cg08370996 NR2F2;NR2F2
## cg06144905 PIPOX;PIPOX;PIPOX
## GencodeCompV12_Accession
## cg21801378 ENST00000539635.1;ENST00000567083.1;ENST00000569547.1;ENST00000287202.5;ENST00000539635.1;ENST00000570175.1;ENST00000379915.4
## cg23124451 ENST00000477827.1;ENST00000482294.1
## cg19724470 ENST00000381573.4;ENST00000381577.3
## cg08370996 ENST00000394166.3;ENST00000394166.3
## cg06144905 ENST00000323372.4;ENST00000466889.1;ENST00000469082.1
## GencodeCompV12_Group
## cg21801378 1stExon;1stExon;1stExon;1stExon;5'UTR;5'UTR;3'UTR
## cg23124451 3'UTR;3'UTR
## cg19724470 5'UTR;5'UTR
## cg08370996 5'UTR;1stExon
## cg06144905 TSS200;TSS1500;TSS1500
## DNase_Hypersensitivity_NAME DNase_Hypersensitivity_Evidence_Count
## cg21801378 chr15:72611880-72612415 3
## cg23124451 NA
## cg19724470 chr9:5450860-5451110 3
## cg08370996 chr15:96873300-96875030 3
## cg06144905 chr17:27369445-27370195 3
## OpenChromatin_NAME OpenChromatin_Evidence_Count
## cg21801378 NA
## cg23124451 chr22:39546864-39549854 6
## cg19724470 NA
## cg08370996 NA
## cg06144905 NA
## TFBS_NAME TFBS_Evidence_Count Methyl27_Loci
## cg21801378 NA TRUE
## cg23124451 chr22:39547710-39549680 6 TRUE
## cg19724470 NA TRUE
## cg08370996 NA TRUE
## cg06144905 NA TRUE
## Methyl450_Loci Chromosome_36 Coordinate_36
## cg21801378 TRUE 15 70399179
## cg23124451 TRUE 22 37878077
## cg19724470 TRUE 9 5440936
## cg08370996 TRUE 15 94675035
## cg06144905 TRUE 17 24393906
## SNP_ID
## cg21801378 rs115738898;rs141628783;rs150517327;rs368237855;rs574078216
## cg23124451
## cg19724470 rs185005359;rs866066;rs559393240;rs532929011;rs551514059;rs190049644
## cg08370996
## cg06144905 rs202118547;rs570473003;rs536095412
## SNP_DISTANCE
## cg21801378 30;29;27;14;1
## cg23124451
## cg19724470 12;17;23;33;34;40
## cg08370996
## cg06144905 1;1;2
## SNP_MinorAlleleFrequency Random_Loci X
## cg21801378 0.010982;0.000246;0.000247;0.000200;0.000200 NA NA
## cg23124451 NA NA
## cg19724470 0.000200;0.139477;0.000799;0.000200;0.000200;0.000200 NA NA
## cg08370996 NA NA
## cg06144905 0.016374;0.000200;0.001198 NA NA
## strand Probe_rs Probe_maf CpG_rs CpG_maf SBE_rs SBE_maf CH_450_XY
## cg21801378 - rs115738898 0.010982 <NA> NA <NA> NA No
## cg23124451 + <NA> NA <NA> NA <NA> NA No
## cg19724470 + rs866066 0.278954 <NA> NA <NA> NA No
## cg08370996 - <NA> NA <NA> NA <NA> NA No
## cg06144905 + <NA> NA <NA> NA <NA> NA No
## CH_450_Aut CH_EPIC Cross_Hyb
## cg21801378 No No No
## cg23124451 No No No
## cg19724470 No No No
## cg08370996 No No No
## cg06144905 No No No
EPIC_Annotation_Complete[which(EPIC_Annotation_Complete$Name %in% overlap_cpgs),]$UCSC_RefGene_Name## [1] BRUNOL6 CBX7 CD274 NR2F2;NR2F2 PIPOX
## 66070 Levels: A1BG A1BG-AS1;A1BG A1BG-AS1;A1BG;ZNF497;ZNF497 ... ZZZ3;ZZZ3;ZZZ3
Find overlap between age-associated clock CpGs and EWAS hits.
#Load epigenetic clock CpGs.
EWAS_hits <- read.csv("~/KoborLab/kobor_space/kendrix/macular_degeneration/data/EWAS_hits.csv", header = TRUE)
#Find overlap.
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmHorvath %in% EWAS_hits$EWAS_hits,]$DNAmHorvath## factor(0)
## 354 Levels: cg00075967 cg00091693 cg00168942 cg00374717 ... cg27544190
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmHannum %in% EWAS_hits$EWAS_hits,]$DNAmHannum## factor(0)
## 72 Levels: cg00481951 cg00486113 cg00748589 cg01528542 ... ch.2.30415474F
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmCortical %in% EWAS_hits$EWAS_hits,]$DNAmCortical## factor(0)
## 348 Levels: cg00059225 cg00088042 cg00252534 cg00297950 ... ch.2.71774667F
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmSkinBlood %in% EWAS_hits$EWAS_hits,]$DNAmSkinBlood## factor(0)
## 392 Levels: cg00017842 cg00073460 cg00194146 cg00257455 ... cg27544190
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmZhang %in% EWAS_hits$EWAS_hits,]$DNAmZhang## [1] cg22197050
## 514 Levels: cg00096065 cg00241664 cg00271522 cg00302979 ... cg27640528
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmPhenoAge %in% EWAS_hits$EWAS_hits,]$DNAmPhenoAge## factor(0)
## 514 Levels: cg00079056 cg00083937 cg00113951 cg00168942 ... cg27655905
Epigenetic_clocks_CpGs[Epigenetic_clocks_CpGs$DNAmWeidner %in% EWAS_hits$EWAS_hits,]$DNAmWeidner## factor(0)
## Levels: cg02228185 cg17861230 cg25809905
DNAmTL is a methylation estimator for telomere length based on 140CpGs. DNAmTL is said to be not only an epigenetic biomarker of replicative history of cells, but also a useful marker for age-related pathologies that are associated with it.
This script is used to do telomere length prediction based on Illumina 450K and EPIC DNA methylation data. Coefficients of the predictor are based on training samples.
#Subset relevant epigenetic age variables.
AMD_Epigenetic_DNAmTL <- AMD_pData[, c("Sample_Name", "Sample_Group", "Array", "Slide", "Sex", "predictedGender", "Tissue", "predictedTissue", "Age", "DNAmTL")] #Create function to plot linear regression.
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point(aes(colour = AMD_Epigenetic_DNAmTL$Sex, shape = AMD_Epigenetic_DNAmTL$Sample_Group)) +
stat_smooth(method = "lm", col = "red")
}
#Regression plot of Zhang clock predicted DNAmAge against reported age:
AMD_DNAmTL_plot <- lm(DNAmTL ~ Age, data = AMD_Epigenetic_DNAmTL)
ggplotRegression(AMD_DNAmTL_plot) + labs(x = "Reported age", y = "DNAm telomere length", caption = "Correlation between chronological age and predicted telomere length") +
scale_colour_manual(name = "Sex", values = c("#990000", "#2c7dab")) +
scale_shape_manual(name = "Disease State", values = c(20, 23), labels = c("AMD", "Normal")) +
stat_cor(method = "spearman") +
theme_bw() + theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text = element_text(),
axis.title = element_text(size = 8),
plot.caption = element_text(hjust = 0.5, size = rel(1)))## `geom_smooth()` using formula 'y ~ x'
#Plot of overall disease state vs control:
AMD_Epigenetic_DNAmTL$Sample_Group <- factor(AMD_Epigenetic_DNAmTL$Sample_Group)
ggplot(data = AMD_Epigenetic_DNAmTL,
aes(x = Sample_Group, y = DNAmTL, fill = Sample_Group)) +
geom_boxplot(position = position_dodge(0.7), width = 0.5, outlier.shape = NA) +
geom_jitter(aes(fill = Sample_Group), size = 2, shape = 21, alpha = 0.7,
position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.3)) +
xlab("Disease State") + ylab("DNAmTL Prediction") +
scale_x_discrete(breaks = c("age-related macular degeneration","normal"),
labels = c("Age-Related Macular Degeneration", "Normal")) +
scale_fill_manual(values = c("#77284e", "#ffffff")) +
theme_classic() + theme(
panel.border = element_blank(),
panel.background = element_blank(),
plot.title = element_text(size = 14, family = "Tahoma", face = "bold"),
text = element_text(family = "Tahoma"),
axis.title = element_text(face = "bold"),
axis.text.x = element_text(colour = "black", size = 11),
axis.text.y = element_text(colour = "black", size = 9),
axis.line = element_line(size = 0.5, colour = "black"),
legend.position = "none")Chen Y et al. (2013) Discovery of cross-reactive probes and polymorphic CpGs in the Illumina Infinium HumanMethylation450 microarray. Epigenetics.
Heiss J & Just JC (2019) Improved filtering of DNA methylation microarray data by detection p values and its impact on downstream anayses. Clinical Epigenetics.
Heiss J & Just JC (2018) Identifying mislabeled and contaminated DNA methylation microarray data: an extended quality control toolset with examples from GEO. Clinial Epigenetics.